Importing library¶
In [15]:
from tensorflow.keras.applications.resnet50 import preprocess_input as resnet_preprocess_input
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.applications.vgg19 import VGG19
from tensorflow.keras.applications.vgg19 import preprocess_input as vgg_preprocess_input
from tensorflow.keras.applications.efficientnet import EfficientNetB0
from tensorflow.keras.applications.efficientnet import preprocess_input as efficientnet_preprocess_input
from tensorflow.keras.layers import GlobalAveragePooling2D
from tensorflow.keras.models import Model
import matplotlib.pyplot as plt
import pathlib
import cv2
from tensorflow.keras.utils import image_dataset_from_directory
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import matplotlib.pyplot as plt
import tensorflow_addons as tfa
import os, sys
import matplotlib.pyplot as plt
import numpy as np
from bokeh.plotting import figure, show, output_notebook, output_file
from bokeh.models import HoverTool, ColumnDataSource
from bokeh.models import HoverTool, ColumnDataSource, ImageURL
from bokeh.transform import linear_cmap
from bokeh.palettes import Category10
from bokeh.models import LinearColorMapper
from bokeh.plotting import figure, show
from bokeh.models import HoverTool, ColumnDataSource, LegendItem, Legend
from bokeh.palettes import Category10
class DataLoader:
    """Load an image-classification dataset from a class-per-subfolder directory.

    Exposes:
        data          -- unshuffled tf.data.Dataset; deterministic order so
                         labels and file paths stay aligned with features
                         extracted downstream.
        shuffled_data -- shuffled variant, used only for visualization.
        labels        -- np.ndarray of integer labels, in `data` order.
        class_names   -- class-name strings inferred from sub-folder names.
    """

    def __init__(self, dataset_path, image_size=(224, 224), batch_size=32):
        # Path to the dataset
        self.dataset_path = pathlib.Path(dataset_path)
        self.batch_size = batch_size
        self.image_size = image_size
        # NOTE(review): this generator is never used below (the tf.data
        # pipeline is built with image_dataset_from_directory instead);
        # kept only so existing code reading `datagen` keeps working.
        self.datagen = ImageDataGenerator(rescale=1. / 255,)
        # shuffle=False keeps the file order deterministic; this prints the
        # usual "Found N files belonging to K classes." summary once.
        self.data = image_dataset_from_directory(
            self.dataset_path,
            seed=42,
            image_size=self.image_size,
            labels='inferred',
            shuffle=False,
            batch_size=self.batch_size,
        )
        # Suppress the duplicate "Found N files..." print for the shuffled copy.
        with HiddenPrints():
            self.shuffled_data = image_dataset_from_directory(
                self.dataset_path,
                seed=42,
                image_size=self.image_size,
                labels='inferred',
                shuffle=True,
                batch_size=self.batch_size,
            )
        # BUG FIX: the original did `self.labels = self.labels()`, clobbering
        # the method with its own result. Use a private helper so the public
        # `labels` attribute never collides with a method name.
        self.labels = self._collect_labels()
        self.class_names = self.data.class_names

    def class_labels(self):
        """Return the inferred class names as a plain list."""
        return list(self.data.class_names)

    def _collect_labels(self):
        """Concatenate every batch's labels (in dataset order) into one array."""
        labels = []
        for images, label_batch in self.data:
            labels.extend(label_batch.numpy())
        return np.array(labels)

    # Function to visualize some images
    def visualize(self, num_samples=5):
        """Show `num_samples` images from one shuffled batch in a single row."""
        plt.figure(figsize=(10, 8))
        for images, labels in self.shuffled_data.take(1):
            for i in range(num_samples):
                plt.subplot(1, num_samples, i + 1)
                # Pixels arrive as floats in [0, 255]; scale to [0, 1] for imshow.
                plt.imshow(images[i] / 255.0)
                plt.title(f'{self.class_labels()[labels[i]]}')
        plt.tight_layout()
        plt.show()

    def visualize_grid(self, num_samples=5):
        """Show up to `num_samples` images from one shuffled batch in a grid
        of 5 columns."""
        num_rows = (num_samples + 4) // 5  # Calculate the number of rows needed
        plt.figure(figsize=(10, 8))
        for images, labels in self.shuffled_data.take(1):
            for row in range(num_rows):
                for i in range(5):
                    index = row * 5 + i
                    if index >= len(images):
                        break  # No more images to display in this row
                    plt.subplot(num_rows, 5, row * 5 + i + 1)
                    plt.imshow(images[index] / 255.0)
                    plt.title(f'{self.class_labels()[labels[index]]}')
        plt.tight_layout()
        plt.show()

    def visualize_class_distribution(self):
        """Plot a bar chart of per-class sample counts with a color legend."""
        plt.figure(figsize=(10, 6))
        # Extract labels and class names
        labels = self.labels
        class_names = self.class_names
        # Count occurrences of each class label
        class_counts = {class_name: np.sum(labels == idx) for idx, class_name in enumerate(class_names)}
        # Generate a unique color for each class
        colors = plt.cm.get_cmap('tab20', len(class_names))
        # Create a bar graph with different colors for each class
        plt.bar(class_counts.keys(), class_counts.values(), color=[colors(i) for i in range(len(class_names))])
        plt.xlabel('Class Label')
        plt.ylabel('Count')
        plt.title('Class Distribution')
        # Add a legend for class names
        handles = [plt.Rectangle((0, 0), 1, 1, color=colors(i), ec="k", label=class_name) for i, class_name in enumerate(class_names)]
        plt.legend(handles=handles, title='Class Names', loc='upper right')
        plt.show()
class HiddenPrints:
    """Context manager that silences `print` output inside its body by
    redirecting sys.stdout to os.devnull, restoring the original stream on
    exit (including when an exception propagates)."""

    def __enter__(self):
        self._original_stdout = sys.stdout
        sys.stdout = open(os.devnull, 'w')
        # BUG FIX: return self so `with HiddenPrints() as hp:` binds the
        # manager (the original implicitly returned None).
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Close our devnull handle, then restore the caller's stream.
        sys.stdout.close()
        sys.stdout = self._original_stdout
class CustomModel:
    """Wrap a pretrained CNN backbone (ResNet50 / VGG19 / EfficientNetB0) as a
    global-average-pooled feature extractor over a DataLoader's dataset."""

    # Constructor for CustomModel, initializes model according to parameters passed
    def __init__(self, model_name, data_loader, weights="imagenet", include_top=False):
        self.model_name = model_name
        self.weights = weights
        self.include_top = include_top
        self.data = data_loader.data
        self.class_names = data_loader.class_names
        self.model = self.initialize_model()
        self.features = None
        self.data_loader = data_loader

    # Initialize a model with custom parameters like model type, weights etc.
    def initialize_model(self):
        """Build the backbone for `model_name`, attach a GAP head, and map the
        matching keras preprocessing function over `self.data`.

        Raises:
            ValueError: if `model_name` is not a supported backbone.
        """
        # ResNet50 model
        if self.model_name == 'resnet50':
            base_model = ResNet50(weights=self.weights, include_top=self.include_top)
            output = GlobalAveragePooling2D()(base_model.output)
            model = Model(inputs=base_model.input, outputs=output)
            self.data = self.data.map(lambda x, y: (resnet_preprocess_input(x), y))
            return model
        # VGG19 model
        if self.model_name == 'vgg19':
            base_model = VGG19(weights=self.weights, include_top=self.include_top)
            output = GlobalAveragePooling2D()(base_model.output)
            model = Model(inputs=base_model.input, outputs=output)
            self.data = self.data.map(lambda x, y: (vgg_preprocess_input(x), y))
            return model
        # EfficientNetB0 model
        if self.model_name == 'efficientnet':
            base_model = EfficientNetB0(weights=self.weights, include_top=self.include_top)
            output = GlobalAveragePooling2D()(base_model.output)
            model = Model(inputs=base_model.input, outputs=output)
            self.data = self.data.map(lambda x, y: (efficientnet_preprocess_input(x), y))
            return model
        # BUG FIX: the original silently returned None for unknown names,
        # deferring the failure to the first predict() call.
        raise ValueError(f"Unsupported model_name: {self.model_name!r}")

    # For extracting features from custom model, common for all models
    def extract_features(self):
        """Run the backbone over the whole dataset and cache the features."""
        features = self.model.predict(self.data)
        self.features = features
        print(f"{self.model_name} extracted feature space size :", self.features.shape)
from sklearn.manifold import TSNE
from sklearn.manifold import MDS
from bokeh.models import ColumnDataSource, CustomJSHover, HoverTool
from bokeh.plotting import output_file, save
import umap
from sklearn.cluster import SpectralClustering, KMeans
import hdbscan
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, fowlkes_mallows_score,homogeneity_completeness_v_measure
class FeatureSpaceVisualizer:
    """Reduce a CustomModel's extracted feature space to 2-D (t-SNE / MDS /
    UMAP) and provide plotting plus clustering-evaluation utilities."""

    def __init__(self, model, reduction_method='tsne'):
        self.original_feature_space = model.features
        self.data = model.data
        self.model_name = model.model_name
        self.class_names = model.class_names
        self.reduction_method = reduction_method
        self.low_dim_feature_space = None
        self.extrinsic_metrics = {}
        self.initialize_visualizer()
        self.data_loader = model.data_loader

    def initialize_visualizer(self):
        """Fit the configured reduction method and store the 2-D embedding.

        Raises:
            ValueError: if `reduction_method` is not 'tsne', 'mds', or 'umap'.
        """
        if self.reduction_method == 'tsne':
            tsne = TSNE(n_components=2, random_state=42)
            self.low_dim_feature_space = tsne.fit_transform(self.original_feature_space)
            print(f'TSNE reduced {self.model_name} feature space shape :', self.low_dim_feature_space.shape)
        elif self.reduction_method == 'mds':
            # BUG FIX: seed MDS for reproducibility, consistent with t-SNE above.
            mds = MDS(n_components=2, random_state=42)
            self.low_dim_feature_space = mds.fit_transform(self.original_feature_space)
        elif self.reduction_method == 'umap':
            self.low_dim_feature_space = umap.UMAP().fit_transform(self.original_feature_space)
            print(f'UMAP reduced {self.model_name} feature space shape :', self.low_dim_feature_space.shape)
        else:
            # BUG FIX: the original silently left the embedding as None for
            # unknown methods, producing confusing failures later.
            raise ValueError(f"Unsupported reduction_method: {self.reduction_method!r}")

    def visualize(self):
        """Scatter-plot the 2-D embedding colored by true class label and save
        it as '<model_name>.png'."""
        labels = []
        for images, label_batch in self.data:
            labels.extend(label_batch.numpy())
        labels = np.array(labels)
        # Create a scatter plot
        plt.figure(figsize=(8, 6))
        plt.scatter(self.low_dim_feature_space[:, 0], self.low_dim_feature_space[:, 1], c=labels, cmap='viridis')
        class_names = self.class_names
        # Map each class label to a distinct viridis color for the legend.
        unique_labels = np.unique(labels)
        colors = plt.cm.viridis(np.linspace(0, 1, len(unique_labels)))
        class_color_mapping = {label: color for label, color in zip(unique_labels, colors)}
        # Add legend with custom class names and colors
        legend_elements = [plt.Line2D([0], [0], marker='o', color='w', label=class_names[label], markerfacecolor=color, markersize=10) for label, color in class_color_mapping.items()]
        plt.legend(handles=legend_elements, loc='upper right')
        # BUG FIX: label the plot with the configured reduction method instead
        # of hard-coding 't-SNE' (the default path prints the same text).
        method_labels = {'tsne': 't-SNE', 'mds': 'MDS', 'umap': 'UMAP'}
        method = method_labels.get(self.reduction_method, self.reduction_method)
        plt.title(f'{method} Visualization of {self.model_name} features')
        plt.xlabel(f'{self.model_name} {method} Dimension 1')
        plt.ylabel(f'{self.model_name} {method} Dimension 2')
        plt.savefig(f'{self.model_name}.png', dpi=300)
        plt.show()

    def spectral_clustering(self):
        """Cluster the 2-D embedding with spectral clustering (k = #classes)
        and scatter-plot the resulting cluster assignments."""
        n_clusters = len(self.class_names)
        spectral_clustering = SpectralClustering(n_clusters=n_clusters, random_state=42)
        spectral_cluster_labels = spectral_clustering.fit_predict(self.low_dim_feature_space)
        plt.figure(figsize=(8, 6))
        plt.scatter(self.low_dim_feature_space[:, 0], self.low_dim_feature_space[:, 1], c=spectral_cluster_labels, cmap='viridis', s=50)
        plt.title('Spectral Clustering')
        plt.xlabel('Feature 1')
        plt.ylabel('Feature 2')
        plt.show()

    def supervised_metrics(self, true_labels, cluster_labels, title):
        """Print and return extrinsic clustering metrics (ARI, NMI,
        Fowlkes-Mallows, V-measure) comparing cluster labels to ground truth."""
        ari = adjusted_rand_score(true_labels, cluster_labels)
        nmi = normalized_mutual_info_score(true_labels, cluster_labels)
        fm = fowlkes_mallows_score(true_labels, cluster_labels)
        # Returns (homogeneity, completeness, v_measure); only v_measure is used.
        vm = homogeneity_completeness_v_measure(true_labels, cluster_labels)
        print(f"{title} Adjusted Rand Index (ARI -1-1): {ari:.2f}")
        print(f"{title} Normalized Mutual Information (NMI 0-1): {nmi:.2f}")
        print(f"{title} Fowlkes-Mallows Score (0-1): {fm:.2f}")
        print(f"{title} Vmeasure (0-1): {vm[2]:.2f}")
        extrinsic_metrics = {}
        extrinsic_metrics[f'{title}ari'] = ari
        extrinsic_metrics[f'{title}nmi'] = nmi
        extrinsic_metrics[f'{title}fm'] = fm
        extrinsic_metrics[f'{title}vm'] = vm[2]
        return extrinsic_metrics

    def kmeans_clustering(self):
        """Run KMeans (k = #classes) on the ORIGINAL high-dimensional features,
        score it against the true labels, and plot assignments on the 2-D
        embedding. Returns the extrinsic-metric dict."""
        n_clusters = len(self.class_names)
        kmeans_clustering = KMeans(n_clusters=n_clusters, random_state=42)
        kmeans_clustering.fit_predict(self.original_feature_space)
        kmeans_metrics = self.supervised_metrics(self.data_loader.labels, kmeans_clustering.labels_, 'KMeans')
        plt.figure(figsize=(8, 6))
        plt.scatter(self.low_dim_feature_space[:, 0], self.low_dim_feature_space[:, 1], c=kmeans_clustering.labels_, cmap='viridis', s=50)
        plt.title('KMeans Clustering')
        plt.xlabel('Feature 1')
        plt.ylabel('Feature 2')
        plt.show()
        return kmeans_metrics

    def hdbscan_clustering(self):
        """Run HDBSCAN on the ORIGINAL high-dimensional features, score it
        against the true labels, and plot assignments on the 2-D embedding.
        Returns the extrinsic-metric dict."""
        HDB = hdbscan.HDBSCAN(min_cluster_size=5)
        HDB.fit(self.original_feature_space)
        extrinsic_metrics = self.supervised_metrics(self.data_loader.labels, HDB.labels_, 'HDBSCAN')
        plt.figure(figsize=(8, 6))
        # Scatter plot each data point with a color corresponding to its cluster
        plt.scatter(self.low_dim_feature_space[:, 0], self.low_dim_feature_space[:, 1], c=HDB.labels_, cmap='viridis', s=50)
        plt.title('HDBSCAN Clustering')
        plt.xlabel('Feature 1')
        plt.ylabel('Feature 2')
        plt.show()
        return extrinsic_metrics

    def visualize_bokeh(self, save=False):
        """Build an interactive Bokeh scatter plot of the embedding with
        hover tooltips that show each point's label, file path and image.
        Returns the Bokeh figure (the `save` flag is currently unused)."""
        title = f"TSNE visualization for {self.model_name}"
        data = self.low_dim_feature_space
        data_loader = self.data_loader
        # Extract class labels:
        class_labels = data_loader.labels
        class_names = data_loader.class_labels()
        # Create absolute file:// URLs so the browser can render the images.
        image_paths = [str(pathlib.Path.cwd() / image_path) for image_path in data_loader.data.file_paths]
        image_paths_with_file_scheme = ['file:///' + path for path in image_paths]
        # Store file names
        file_names = [str(os.path.split(path)[1]) for path in image_paths]
        # Define a color palette based on the number of unique class labels
        unique_labels = list(set(class_labels))
        num_classes = len(unique_labels)
        if num_classes < 3:
            colors = ['#1f77b4', '#ff7f0e']
        else:
            # NOTE(review): Category10 only defines palettes for 3-10 classes;
            # more than 10 classes would raise KeyError here — confirm the
            # datasets stay within that bound.
            colors = Category10[num_classes]
        # Map class labels to colors
        color_mapping = {label: colors[i] for i, label in enumerate(unique_labels)}
        point_colors = [color_mapping[label] for label in class_labels]
        # Map numerical labels to class names
        class_names_mapping = {i: class_name for i, class_name in enumerate(class_names)}
        label_names = [class_names_mapping[label] for label in class_labels]
        # Create a Bokeh ColumnDataSource with image data
        source = ColumnDataSource(data=dict(
            x=data[:, 0],
            y=data[:, 1],
            imgs=image_paths_with_file_scheme,  # Store image filenames for tooltips
            labels=label_names,
            fnames=file_names,
            colors=point_colors,  # Store point colors
        ))
        # Create a new Bokeh figure for the scatter plot
        p = figure(title=title, toolbar_location='right', tools="pan,box_zoom,reset,wheel_zoom")
        # Define the tooltip template
        tooltip_template = """
        <div>
            <div>
                <span style="font-size: 14px; font-weight: bold;">Label: </span>
                <span style="font-size: 14px;">@labels</span>
            </div>
            <div>
                <span style="font-size: 10px; font-weight: bold;">File:@imgs</span>
            </div>
            <div>
                <img src="@imgs" alt="" width="200" height="200">
            </div>
        </div>
        """
        # Add tooltips using the template
        hover = HoverTool(tooltips=tooltip_template)
        # Add the hover tool to the plot
        p.add_tools(hover)
        # Create a legend and legend items; one renderer per class so the
        # legend can toggle classes independently.
        legend_items = []
        for class_label, class_color in color_mapping.items():
            class_indices = [i for i, label in enumerate(class_labels) if label == class_label]
            class_source = ColumnDataSource(data=dict(
                x=[data[i, 0] for i in class_indices],
                y=[data[i, 1] for i in class_indices],
                imgs=[image_paths_with_file_scheme[i] for i in class_indices],
                labels=[label_names[i] for i in class_indices],
                colors=[class_color] * len(class_indices)
            ))
            scatter = p.scatter('x', 'y', source=class_source, size=8, color='colors', alpha=0.5, legend_label=class_names_mapping[class_label])
            legend_items.append(LegendItem(label=class_names_mapping[class_label], renderers=[scatter]))
        legend = Legend(items=legend_items)
        p.add_layout(legend)
        return p
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from s_dbw import S_Dbw
from sklearn.neighbors import LocalOutlierFactor
import hdbscan
class Metrics:
    """Compute intrinsic clustering-quality metrics (silhouette, DBI, CH,
    S_Dbw) plus an LOF outlier count for a feature space with known labels.
    Metrics marked '*' in the printout are lower-is-better."""

    def __init__(self, feature_space, data_loader, decimal_places=3):
        self.feature_space = feature_space
        self.labels = data_loader.labels
        self.decimal_places = decimal_places
        self.metrics = None
        # Compute (and print) immediately so `self.metrics` is always populated.
        self.calculate()

    def calculate(self):
        """Compute, print, cache and return the metric dict."""
        features = self.feature_space
        labels = self.labels
        # Outlier count via Local Outlier Factor with k = sqrt(n) neighbors.
        LOF = LocalOutlierFactor(n_neighbors=int(np.sqrt(len(labels))))
        outliers = LOF.fit_predict(features)
        num_outliers = len(np.where(outliers == -1)[0])
        # Intrinsic metrics
        silhouette = silhouette_score(features, labels)
        davies_bouldin_index = davies_bouldin_score(features, labels)
        calinski_harabasz_index = calinski_harabasz_score(features, labels)
        s_dbw = S_Dbw(features, labels, centers_id=None, method='Tong', alg_noise='bind', centr='mean', nearest_centr=True, metric='euclidean')
        metrics = {'silhouette': silhouette, 'DBI': davies_bouldin_index, 'CH': calinski_harabasz_index, 'sdbw': s_dbw, 'outliers': num_outliers}
        # BUG FIX: the outlier count is an integer; the original printed it
        # with a float format (e.g. "21.000").
        print('# of outliers : {}'.format(metrics['outliers']))
        print('Silhouette score : {:.{dp}f}'.format(metrics['silhouette'], dp=self.decimal_places))
        print('Davies Bouldin Index *: {:.{dp}f}'.format(metrics['DBI'], dp=self.decimal_places))
        print('Calinski Harabasz Index: {:.{dp}f}'.format(metrics['CH'], dp=self.decimal_places))
        print('S_Dbw *: {:.{dp}f}'.format(metrics['sdbw'], dp=self.decimal_places))
        self.metrics = metrics
        return metrics
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import (
accuracy_score,
precision_score,
recall_score,
f1_score,
confusion_matrix,
)
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import to_categorical
from keras.layers import Dropout
from keras.optimizers import Adam
class ModelEvaluator:
    """Evaluate classic classifiers (SVM, k-NN, Random Forest) and a small
    dense neural network on extracted features, via a single train/test split
    or stratified cross-validation."""

    def __init__(self, model, data_loader, feature_visualizer, test_size=0.30, verbose=False):
        self.verbose = verbose
        self.model_name = model.model_name
        self.features = model.features
        self.labels = data_loader.labels
        self.low_dim_features = feature_visualizer.low_dim_feature_space
        # Single held-out split used by the evaluate_* methods below.
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.features, self.labels, test_size=test_size, random_state=42
        )

    def evaluate_svm(self):
        """Fit a linear SVM on the train split and print test metrics."""
        svm_classifier = SVC(kernel='linear', C=1.0)
        svm_classifier.fit(self.X_train, self.y_train)
        svm_predictions = svm_classifier.predict(self.X_test)
        if self.verbose:
            self.print_metrics("SVM Classifier Metrics:", svm_predictions)
        else:
            self.print_short_metrics("SVM Classifier", svm_predictions)

    def evaluate_knn(self):
        """Fit k-NN (k = sqrt(train size)) on the train split and print test
        metrics."""
        n_neighbors = int(np.sqrt(len(self.X_train)))
        knn_classifier = KNeighborsClassifier(n_neighbors=n_neighbors)
        knn_classifier.fit(self.X_train, self.y_train)
        knn_predictions = knn_classifier.predict(self.X_test)
        if self.verbose:
            self.print_metrics("k-NN Classifier Metrics:", knn_predictions)
        else:
            self.print_short_metrics("k-NN Classifier", knn_predictions)

    def build_nn(self):
        """Build and compile a fresh dense softmax classifier sized to the
        feature dimensionality and number of classes."""
        optimizer = Adam(learning_rate=0.001)
        num_classes = len(np.unique(self.labels))
        # BUG FIX: typo in the original debug print ("num claasees").
        print('num classes :', num_classes)
        # Define a neural network model for multi-class classification
        model = Sequential()
        model.add(Dense(512, input_dim=self.X_train.shape[1], activation='relu'))
        model.add(Dense(256, activation='relu'))
        model.add(Dense(128, activation='relu'))
        model.add(Dense(64, activation='relu'))
        model.add(Dense(32, activation='relu'))
        model.add(Dense(num_classes, activation='softmax'))  # Use 'softmax' for multi-class
        # Compile the model
        model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
        return model

    def evaluate_nn(self):
        """Train the dense classifier on the train split, print test metrics,
        and return the trained model."""
        y_train_categorical = to_categorical(self.y_train)
        y_test_categorical = to_categorical(self.y_test)
        model = self.build_nn()
        # Train the model
        model.fit(self.X_train, y_train_categorical, epochs=30, batch_size=32, verbose=1)
        # Make predictions on the test data
        nn_pred_onehot = model.predict(self.X_test)
        nn_pred = np.argmax(nn_pred_onehot, axis=1)  # Convert one-hot to class labels
        if self.verbose:
            self.print_metrics("Neural Network Classifier Metrics:", nn_pred)
        else:
            self.print_short_metrics("Neural Network Classifier", nn_pred)
        return model

    def evaluate_random_forest(self, n_estimators=100):
        """Fit a random forest on the train split and print test metrics."""
        rf_classifier = RandomForestClassifier(n_estimators=n_estimators, random_state=42)
        rf_classifier.fit(self.X_train, self.y_train)
        rf_predictions = rf_classifier.predict(self.X_test)
        if self.verbose:
            self.print_metrics("Random Forest Classifier Metrics:", rf_predictions)
        else:
            self.print_short_metrics("Random Forest Classifier", rf_predictions)

    def print_metrics(self, title, predictions):
        """Print accuracy/precision/recall/F1 (weighted) and the confusion
        matrix for `predictions` against the held-out test labels."""
        accuracy = accuracy_score(self.y_test, predictions)
        precision = precision_score(self.y_test, predictions, average='weighted')
        recall = recall_score(self.y_test, predictions, average='weighted')
        f1 = f1_score(self.y_test, predictions, average='weighted')
        confusion = confusion_matrix(self.y_test, predictions)
        print(title)
        print(f"Accuracy: {accuracy*100:.2f}%")
        print(f"Precision: {precision*100:.2f}%")
        print(f"Recall: {recall*100:.2f}%")
        print(f"F1 Score: {f1*100:.2f}%")
        print("Confusion Matrix:\n", confusion, '\n')

    def print_short_metrics(self, title, predictions):
        """Print only the weighted F1 score for `predictions`."""
        f1 = f1_score(self.y_test, predictions, average='weighted')
        print(f"{title} F1 Score: {f1*100:.2f}%")

    def evaluate(self):
        """Run every classifier once on the single train/test split."""
        print(f'-----------------------{self.model_name}----------------------------')
        self.evaluate_nn()
        # Evaluate SVM Classifier
        self.evaluate_svm()
        # Evaluate k-NN Classifier
        self.evaluate_knn()
        # Evaluate Random Forest Classifier
        self.evaluate_random_forest()

    def evaluate_with_cross_validation(self, n_splits=10):
        """Cross-validate all classifiers on the full feature space.

        Returns:
            dict mapping 'svm'/'knn'/'rf'/'nn' to mean accuracy in percent.
        """
        ml_performances = {}
        # Initialize cross-validation
        cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
        # Evaluate SVM Classifier with cross-validation
        svm_classifier = SVC(kernel='linear', C=1.0)
        svm_scores = cross_val_score(svm_classifier, self.features, self.labels, cv=cv, scoring='accuracy')
        print("SVM Classifier Cross-Validation Accuracy: {:.2f}%".format(np.mean(svm_scores) * 100))
        # Evaluate k-NN Classifier with cross-validation
        knn_classifier = KNeighborsClassifier(n_neighbors=int(np.sqrt(len(self.X_train))))
        knn_scores = cross_val_score(knn_classifier, self.features, self.labels, cv=cv, scoring='accuracy')
        print("k-NN Classifier Cross-Validation Accuracy: {:.2f}%".format(np.mean(knn_scores) * 100))
        # Evaluate Random Forest Classifier with cross-validation
        rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
        rf_scores = cross_val_score(rf_classifier, self.features, self.labels, cv=cv, scoring='accuracy')
        print("Random Forest Classifier Cross-Validation Accuracy: {:.2f}%".format(np.mean(rf_scores) * 100))
        # Evaluate NN Classifier with manual cross-validation (cross_val_score
        # can't drive a keras model directly).
        nn_scores = []
        for train_index, val_index in cv.split(self.features, self.labels):
            X = self.features
            y = self.labels
            X_train, X_val = X[train_index], X[val_index]
            y_train, y_val = y[train_index], y[val_index]
            # Define and compile your neural network model
            model = self.build_nn()
            # Convert labels to one-hot encoding
            y_train_categorical = to_categorical(y_train)
            y_val_categorical = to_categorical(y_val)
            # Train the model
            model.fit(X_train, y_train_categorical, epochs=10, batch_size=32, verbose=0)
            # Make predictions on the validation set
            y_val_pred = model.predict(X_val)
            y_val_pred_classes = np.argmax(y_val_pred, axis=1)
            # Calculate accuracy and store it in the list
            accuracy = accuracy_score(y_val, y_val_pred_classes)
            nn_scores.append(accuracy)
        print("Neural network Classifier Cross-Validation Accuracy: {:.2f}%".format(np.mean(nn_scores) * 100))
        ml_performances['rf'] = np.mean(rf_scores) * 100
        ml_performances['knn'] = np.mean(knn_scores) * 100
        ml_performances['svm'] = np.mean(svm_scores) * 100
        ml_performances['nn'] = np.mean(nn_scores) * 100
        return ml_performances

    def evaluate_low_dim_with_cross_validation(self, n_splits=10):
        """Cross-validate SVM/k-NN/RF on the 2-D reduced feature space.

        Returns:
            dict mapping 'svm'/'knn'/'rf' to mean accuracy in percent
            (CONSISTENCY FIX: the original printed but returned nothing,
            unlike its full-dimensional sibling).
        """
        ml_performances = {}
        # Initialize cross-validation
        cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
        # Evaluate SVM Classifier with cross-validation
        svm_classifier = SVC(kernel='linear', C=1.0)
        svm_scores = cross_val_score(svm_classifier, self.low_dim_features, self.labels, cv=cv, scoring='accuracy')
        print("SVM Classifier Cross-Validation Accuracy: {:.2f}%".format(np.mean(svm_scores) * 100))
        # Evaluate k-NN Classifier with cross-validation
        knn_classifier = KNeighborsClassifier(n_neighbors=int(np.sqrt(len(self.X_train))))
        knn_scores = cross_val_score(knn_classifier, self.low_dim_features, self.labels, cv=cv, scoring='accuracy')
        print("k-NN Classifier Cross-Validation Accuracy: {:.2f}%".format(np.mean(knn_scores) * 100))
        # Evaluate Random Forest Classifier with cross-validation
        rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
        rf_scores = cross_val_score(rf_classifier, self.low_dim_features, self.labels, cv=cv, scoring='accuracy')
        print("Random Forest Classifier Cross-Validation Accuracy: {:.2f}%".format(np.mean(rf_scores) * 100))
        ml_performances['svm'] = np.mean(svm_scores) * 100
        ml_performances['knn'] = np.mean(knn_scores) * 100
        ml_performances['rf'] = np.mean(rf_scores) * 100
        return ml_performances
In [3]:
# print(f'\n\n -------------------For dataset{dataset.dataset_path}-------------------- \n\n\n\n')
# resnet = CustomModel(model_name='resnet50', data_loader = dataset)
# resnet.extract_features()
# resnet_tsne = FeatureSpaceVisualizer(resnet,reduction_method='tsne' )
# resnet_tsne.visualize()
# extrinsic_metrics = resnet_tsne.kmeans_clustering()
# print(extrinsic_metrics)
# resnet_metrics = Metrics(feature_space = resnet.features, data_loader = dataset)
# # #Evaluate the dataset
# # resnet_evaluator = ModelEvaluator(resnet, dataset, resnet_tsne, test_size = 0.3)
# # ml_scores = resnet_evaluator.evaluate_with_cross_validation()
# # # Populating results for this dataset
# # datasets[dataset].append(resnet_metrics.metrics)
# # datasets[dataset]. append(resnet_tsne.extrinsic_metrics)
# # datasets[dataset]. append(ml_scores)
# # datasets[dataset]. append(extrinsic_metrics)
Setting up pipeline¶
Importing datasets¶
In [157]:
# Loading the datasets.
# BUG FIX: the Mattendichtung path mixed forward slashes with backslashes
# inside a non-raw string ("C:/Users/hdn7rng\Desktop\..."); it only worked
# because \D, \E, \M, \d happen not to be escape sequences. Use forward
# slashes consistently like every other path. (The f-prefixes were also
# dropped: none of these strings contain placeholders.)
mattendichtung_loader = DataLoader("C:/Users/hdn7rng/Desktop/Experiments/datasets/Mattendichtung")
asqmm_loader = DataLoader("C:/Users/hdn7rng/Desktop/Experiments/datasets/ASQMM_CUSTOM")
peg_loader = DataLoader("C:/Users/hdn7rng/Desktop/Experiments/datasets/PEG")
nut_loader = DataLoader("C:/Users/hdn7rng/Desktop/Experiments/datasets/metal_nutold")
screw_loader = DataLoader("C:/Users/hdn7rng/Desktop/Experiments/datasets/screw")
bottle_loader = DataLoader("C:/Users/hdn7rng/Desktop/Experiments/datasets/bottle")
cable_loader = DataLoader("C:/Users/hdn7rng/Desktop/Experiments/datasets/cable")
capsule_loader = DataLoader("C:/Users/hdn7rng/Desktop/Experiments/datasets/capsule")
# flowers_loader = DataLoader("C:/Users/hdn7rng/Desktop/Experiments/datasets/organic/flowers")
catdog_loader = DataLoader("C:/Users/hdn7rng/Desktop/Experiments/datasets/organic/catdog")
mnist_loader = DataLoader("C:/Users/hdn7rng/Desktop/Experiments/datasets/organic/mnist")
Found 5600 files belonging to 4 classes. Found 2916 files belonging to 2 classes. Found 1000 files belonging to 2 classes. Found 358 files belonging to 2 classes. Found 480 files belonging to 6 classes. Found 292 files belonging to 4 classes. Found 374 files belonging to 9 classes. Found 351 files belonging to 6 classes. Found 2000 files belonging to 2 classes. Found 600 files belonging to 10 classes.
In [158]:
# Register every loader with the evaluation pipeline: each loader maps to a
# (initially empty) list that will collect its result dicts.
_loaders = [
    catdog_loader, asqmm_loader, peg_loader, mattendichtung_loader,
    nut_loader, screw_loader, bottle_loader, capsule_loader,
    cable_loader, mnist_loader,
]
datasets = {loader: [] for loader in _loaders}
efficientnet_datasets = {loader: [] for loader in _loaders}
In [154]:
len(datasets)
--------------------------------------------------------------------------- KeyError Traceback (most recent call last) Cell In[154], line 1 ----> 1 datasets[0] KeyError: 0
In [73]:
%%time
run = 1
# Full ResNet50 pipeline, once per registered dataset: extract features,
# embed with t-SNE, cluster (KMeans + HDBSCAN), compute intrinsic metrics,
# and cross-validate downstream classifiers.
for loader in datasets:
    print(f'\n\n -------------------For dataset{loader.dataset_path}-------------------- \n\n\n\n')
    resnet = CustomModel(model_name='resnet50', data_loader=loader)
    resnet.extract_features()
    resnet_tsne = FeatureSpaceVisualizer(resnet, reduction_method='tsne')
    resnet_tsne.visualize()
    kmeans_metrics = resnet_tsne.kmeans_clustering()
    hdb_metrics = resnet_tsne.hdbscan_clustering()
    resnet_metrics = Metrics(feature_space=resnet.features, data_loader=loader)
    # Evaluate the dataset with cross-validated classifiers.
    resnet_evaluator = ModelEvaluator(resnet, loader, resnet_tsne, test_size=0.3)
    ml_scores = resnet_evaluator.evaluate_with_cross_validation()
    # Collect all result dicts for this dataset.
    results = datasets[loader]
    results.append(resnet_metrics.metrics)
    results.append(kmeans_metrics)
    results.append(ml_scores)
    results.append(hdb_metrics)
-------------------For datasetC:\Users\hdn7rng\Desktop\Experiments\datasets\organic\catdog-------------------- 63/63 [==============================] - 5s 73ms/step resnet50 extracted feature space size : (2000, 2048) TSNE reduced resnet50 feature space shape : (2000, 2)
C:\Users\hdn7rng\.conda\envs\tf\lib\site-packages\sklearn\cluster\_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10)
KMeans Adjusted Rand Index (ARI -1-1): 0.89 KMeans Normalized Mutual Information (NMI 0-1): 0.83 KMeans Fowlkes-Mallows Score (0-1): 0.94 KMeans Vmeasure (0-1): 0.83
HDBSCAN Adjusted Rand Index (ARI -1-1): 0.10 HDBSCAN Normalized Mutual Information (NMI 0-1): 0.24 HDBSCAN Fowlkes-Mallows Score (0-1): 0.60 HDBSCAN Vmeasure (0-1): 0.24
# of outliers : 21.000 Silhouette score : 0.090 Davies Bouldin Index *: 3.154 Calinski Harabasz Index: 192.674 S_Dbw *: 0.954 SVM Classifier Cross-Validation Accuracy: 98.85% k-NN Classifier Cross-Validation Accuracy: 98.20% Random Forest Classifier Cross-Validation Accuracy: 98.15% num claasees : 2 7/7 [==============================] - 0s 3ms/step num claasees : 2 7/7 [==============================] - 0s 3ms/step num claasees : 2 7/7 [==============================] - 0s 3ms/step num claasees : 2 7/7 [==============================] - 0s 3ms/step num claasees : 2 7/7 [==============================] - 0s 0s/step num claasees : 2 7/7 [==============================] - 0s 3ms/step num claasees : 2 7/7 [==============================] - 0s 4ms/step num claasees : 2 7/7 [==============================] - 0s 0s/step num claasees : 2 7/7 [==============================] - 0s 0s/step num claasees : 2 7/7 [==============================] - 0s 1ms/step Neural network Classifier Cross-Validation Accuracy: 98.85% -------------------For datasetC:\Users\hdn7rng\Desktop\Experiments\datasets\ASQMM_CUSTOM-------------------- 92/92 [==============================] - 7s 71ms/step resnet50 extracted feature space size : (2916, 2048) TSNE reduced resnet50 feature space shape : (2916, 2)
C:\Users\hdn7rng\.conda\envs\tf\lib\site-packages\sklearn\cluster\_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10)
KMeans Adjusted Rand Index (ARI -1-1): 0.87 KMeans Normalized Mutual Information (NMI 0-1): 0.75 KMeans Fowlkes-Mallows Score (0-1): 0.96 KMeans Vmeasure (0-1): 0.75
HDBSCAN Adjusted Rand Index (ARI -1-1): 0.90 HDBSCAN Normalized Mutual Information (NMI 0-1): 0.77 HDBSCAN Fowlkes-Mallows Score (0-1): 0.96 HDBSCAN Vmeasure (0-1): 0.77
# of outliers : 25.000 Silhouette score : 0.402 Davies Bouldin Index *: 1.114 Calinski Harabasz Index: 1594.104 S_Dbw *: 0.893 SVM Classifier Cross-Validation Accuracy: 99.42% k-NN Classifier Cross-Validation Accuracy: 97.02% Random Forest Classifier Cross-Validation Accuracy: 98.56% num claasees : 2 10/10 [==============================] - 0s 2ms/step num claasees : 2 10/10 [==============================] - 0s 2ms/step num claasees : 2 10/10 [==============================] - 0s 2ms/step num claasees : 2 10/10 [==============================] - 0s 2ms/step num claasees : 2 10/10 [==============================] - 0s 2ms/step num claasees : 2 10/10 [==============================] - 0s 2ms/step num claasees : 2 10/10 [==============================] - 0s 2ms/step num claasees : 2 10/10 [==============================] - 0s 3ms/step num claasees : 2 10/10 [==============================] - 0s 2ms/step num claasees : 2 10/10 [==============================] - 0s 2ms/step Neural network Classifier Cross-Validation Accuracy: 98.83% -------------------For datasetC:\Users\hdn7rng\Desktop\Experiments\datasets\PEG-------------------- 32/32 [==============================] - 3s 74ms/step resnet50 extracted feature space size : (1000, 2048) TSNE reduced resnet50 feature space shape : (1000, 2)
C:\Users\hdn7rng\.conda\envs\tf\lib\site-packages\sklearn\cluster\_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10) C:\Users\hdn7rng\.conda\envs\tf\lib\site-packages\sklearn\cluster\_kmeans.py:1440: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=4. warnings.warn(
KMeans Adjusted Rand Index (ARI -1-1): 0.03 KMeans Normalized Mutual Information (NMI 0-1): 0.02 KMeans Fowlkes-Mallows Score (0-1): 0.53 KMeans Vmeasure (0-1): 0.02
HDBSCAN Adjusted Rand Index (ARI -1-1): 0.16 HDBSCAN Normalized Mutual Information (NMI 0-1): 0.16 HDBSCAN Fowlkes-Mallows Score (0-1): 0.57 HDBSCAN Vmeasure (0-1): 0.16
# of outliers : 0.000 Silhouette score : 0.053 Davies Bouldin Index *: 4.216 Calinski Harabasz Index: 53.845 S_Dbw *: 0.974 SVM Classifier Cross-Validation Accuracy: 97.40% k-NN Classifier Cross-Validation Accuracy: 92.60% Random Forest Classifier Cross-Validation Accuracy: 95.30% num claasees : 2 4/4 [==============================] - 0s 0s/step num claasees : 2 4/4 [==============================] - 0s 0s/step num claasees : 2 4/4 [==============================] - 0s 0s/step num claasees : 2 4/4 [==============================] - 0s 0s/step num claasees : 2 4/4 [==============================] - 0s 0s/step num claasees : 2 4/4 [==============================] - 0s 0s/step num claasees : 2 4/4 [==============================] - 0s 0s/step num claasees : 2 4/4 [==============================] - 0s 0s/step num claasees : 2 4/4 [==============================] - 0s 0s/step num claasees : 2 4/4 [==============================] - 0s 0s/step Neural network Classifier Cross-Validation Accuracy: 92.80% -------------------For datasetC:\Users\hdn7rng\Desktop\Experiments\datasets\Mattendichtung-------------------- 175/175 [==============================] - 13s 68ms/step resnet50 extracted feature space size : (5600, 2048) TSNE reduced resnet50 feature space shape : (5600, 2)
C:\Users\hdn7rng\.conda\envs\tf\lib\site-packages\sklearn\cluster\_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10)
KMeans Adjusted Rand Index (ARI -1-1): 0.98 KMeans Normalized Mutual Information (NMI 0-1): 0.97 KMeans Fowlkes-Mallows Score (0-1): 0.99 KMeans Vmeasure (0-1): 0.97
HDBSCAN Adjusted Rand Index (ARI -1-1): 0.94 HDBSCAN Normalized Mutual Information (NMI 0-1): 0.92 HDBSCAN Fowlkes-Mallows Score (0-1): 0.96 HDBSCAN Vmeasure (0-1): 0.92
# of outliers : 43.000 Silhouette score : 0.317 Davies Bouldin Index *: 1.402 Calinski Harabasz Index: 1910.041 S_Dbw *: 0.700 SVM Classifier Cross-Validation Accuracy: 99.46% k-NN Classifier Cross-Validation Accuracy: 99.43% Random Forest Classifier Cross-Validation Accuracy: 99.61% num claasees : 4 18/18 [==============================] - 0s 916us/step num claasees : 4 18/18 [==============================] - 0s 3ms/step num claasees : 4 18/18 [==============================] - 0s 2ms/step num claasees : 4 18/18 [==============================] - 0s 1ms/step num claasees : 4 18/18 [==============================] - 0s 2ms/step num claasees : 4 18/18 [==============================] - 0s 1ms/step num claasees : 4 18/18 [==============================] - 0s 924us/step num claasees : 4 18/18 [==============================] - 0s 2ms/step num claasees : 4 18/18 [==============================] - 0s 896us/step num claasees : 4 18/18 [==============================] - 0s 3ms/step Neural network Classifier Cross-Validation Accuracy: 99.50% -------------------For datasetC:\Users\hdn7rng\Desktop\Experiments\datasets\metal_nutold-------------------- 12/12 [==============================] - 2s 97ms/step resnet50 extracted feature space size : (358, 2048) TSNE reduced resnet50 feature space shape : (358, 2)
C:\Users\hdn7rng\.conda\envs\tf\lib\site-packages\sklearn\cluster\_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10) C:\Users\hdn7rng\.conda\envs\tf\lib\site-packages\sklearn\cluster\_kmeans.py:1440: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=2. warnings.warn(
KMeans Adjusted Rand Index (ARI -1-1): 0.32 KMeans Normalized Mutual Information (NMI 0-1): 0.33 KMeans Fowlkes-Mallows Score (0-1): 0.77 KMeans Vmeasure (0-1): 0.33
HDBSCAN Adjusted Rand Index (ARI -1-1): 0.32 HDBSCAN Normalized Mutual Information (NMI 0-1): 0.33 HDBSCAN Fowlkes-Mallows Score (0-1): 0.77 HDBSCAN Vmeasure (0-1): 0.33
# of outliers : 0.000 Silhouette score : 0.191 Davies Bouldin Index *: 2.351 Calinski Harabasz Index: 61.988 S_Dbw *: 1.059 SVM Classifier Cross-Validation Accuracy: 93.03% k-NN Classifier Cross-Validation Accuracy: 80.75% Random Forest Classifier Cross-Validation Accuracy: 88.01% num claasees : 2 2/2 [==============================] - 0s 3ms/step num claasees : 2 2/2 [==============================] - 0s 0s/step num claasees : 2 2/2 [==============================] - 0s 0s/step num claasees : 2 2/2 [==============================] - 0s 0s/step num claasees : 2 2/2 [==============================] - 0s 0s/step num claasees : 2 2/2 [==============================] - 0s 0s/step num claasees : 2 2/2 [==============================] - 0s 16ms/step num claasees : 2 2/2 [==============================] - 0s 2ms/step num claasees : 2 2/2 [==============================] - 0s 0s/step num claasees : 2 2/2 [==============================] - 0s 0s/step Neural network Classifier Cross-Validation Accuracy: 87.71% -------------------For datasetC:\Users\hdn7rng\Desktop\Experiments\datasets\screw-------------------- 15/15 [==============================] - 2s 88ms/step resnet50 extracted feature space size : (480, 2048) TSNE reduced resnet50 feature space shape : (480, 2)
C:\Users\hdn7rng\.conda\envs\tf\lib\site-packages\sklearn\cluster\_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10) C:\Users\hdn7rng\.conda\envs\tf\lib\site-packages\sklearn\cluster\_kmeans.py:1440: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=2. warnings.warn(
KMeans Adjusted Rand Index (ARI -1-1): -0.01 KMeans Normalized Mutual Information (NMI 0-1): 0.02 KMeans Fowlkes-Mallows Score (0-1): 0.33 KMeans Vmeasure (0-1): 0.02
HDBSCAN Adjusted Rand Index (ARI -1-1): 0.13 HDBSCAN Normalized Mutual Information (NMI 0-1): 0.07 HDBSCAN Fowlkes-Mallows Score (0-1): 0.58 HDBSCAN Vmeasure (0-1): 0.07
# of outliers : 0.000 Silhouette score : -0.043 Davies Bouldin Index *: 7.523 Calinski Harabasz Index: 1.741 S_Dbw *: 0.993 SVM Classifier Cross-Validation Accuracy: 81.67% k-NN Classifier Cross-Validation Accuracy: 75.21% Random Forest Classifier Cross-Validation Accuracy: 76.04% num claasees : 6 2/2 [==============================] - 0s 16ms/step num claasees : 6 2/2 [==============================] - 0s 0s/step num claasees : 6 2/2 [==============================] - 0s 0s/step num claasees : 6 2/2 [==============================] - 0s 0s/step num claasees : 6 2/2 [==============================] - 0s 3ms/step num claasees : 6 2/2 [==============================] - 0s 16ms/step num claasees : 6 2/2 [==============================] - 0s 16ms/step num claasees : 6 2/2 [==============================] - 0s 0s/step num claasees : 6 2/2 [==============================] - 0s 0s/step num claasees : 6 2/2 [==============================] - 0s 0s/step Neural network Classifier Cross-Validation Accuracy: 77.92% -------------------For datasetC:\Users\hdn7rng\Desktop\Experiments\datasets\bottle-------------------- 10/10 [==============================] - 2s 99ms/step resnet50 extracted feature space size : (292, 2048) TSNE reduced resnet50 feature space shape : (292, 2)
C:\Users\hdn7rng\.conda\envs\tf\lib\site-packages\sklearn\cluster\_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10) C:\Users\hdn7rng\.conda\envs\tf\lib\site-packages\sklearn\cluster\_kmeans.py:1440: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=2. warnings.warn(
KMeans Adjusted Rand Index (ARI -1-1): 0.46 KMeans Normalized Mutual Information (NMI 0-1): 0.43 KMeans Fowlkes-Mallows Score (0-1): 0.76 KMeans Vmeasure (0-1): 0.43
HDBSCAN Adjusted Rand Index (ARI -1-1): 0.00 HDBSCAN Normalized Mutual Information (NMI 0-1): 0.00 HDBSCAN Fowlkes-Mallows Score (0-1): 0.79 HDBSCAN Vmeasure (0-1): 0.00
# of outliers : 16.000 Silhouette score : 0.193 Davies Bouldin Index *: 2.347 Calinski Harabasz Index: 29.315 S_Dbw *: 1.170 SVM Classifier Cross-Validation Accuracy: 93.14% k-NN Classifier Cross-Validation Accuracy: 87.66% Random Forest Classifier Cross-Validation Accuracy: 92.80% num claasees : 4 1/1 [==============================] - 0s 62ms/step num claasees : 4 1/1 [==============================] - 0s 47ms/step num claasees : 4 1/1 [==============================] - 0s 40ms/step num claasees : 4 1/1 [==============================] - 0s 58ms/step num claasees : 4 1/1 [==============================] - 0s 47ms/step num claasees : 4 1/1 [==============================] - 0s 47ms/step num claasees : 4 1/1 [==============================] - 0s 47ms/step num claasees : 4 1/1 [==============================] - 0s 45ms/step num claasees : 4 1/1 [==============================] - 0s 53ms/step num claasees : 4 1/1 [==============================] - 0s 51ms/step Neural network Classifier Cross-Validation Accuracy: 92.11% -------------------For datasetC:\Users\hdn7rng\Desktop\Experiments\datasets\capsule-------------------- 11/11 [==============================] - 2s 109ms/step resnet50 extracted feature space size : (351, 2048) TSNE reduced resnet50 feature space shape : (351, 2)
C:\Users\hdn7rng\.conda\envs\tf\lib\site-packages\sklearn\cluster\_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10) C:\Users\hdn7rng\.conda\envs\tf\lib\site-packages\sklearn\cluster\_kmeans.py:1440: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=2. warnings.warn(
KMeans Adjusted Rand Index (ARI -1-1): 0.07 KMeans Normalized Mutual Information (NMI 0-1): 0.11 KMeans Fowlkes-Mallows Score (0-1): 0.37 KMeans Vmeasure (0-1): 0.11
HDBSCAN Adjusted Rand Index (ARI -1-1): 0.09 HDBSCAN Normalized Mutual Information (NMI 0-1): 0.12 HDBSCAN Fowlkes-Mallows Score (0-1): 0.52 HDBSCAN Vmeasure (0-1): 0.12
# of outliers : 19.000 Silhouette score : -0.023 Davies Bouldin Index *: 4.687 Calinski Harabasz Index: 5.242 S_Dbw *: 1.080 SVM Classifier Cross-Validation Accuracy: 81.77% k-NN Classifier Cross-Validation Accuracy: 70.37% Random Forest Classifier Cross-Validation Accuracy: 74.37% num claasees : 6 2/2 [==============================] - 0s 0s/step num claasees : 6 2/2 [==============================] - 0s 16ms/step num claasees : 6 2/2 [==============================] - 0s 0s/step num claasees : 6 2/2 [==============================] - 0s 16ms/step num claasees : 6 2/2 [==============================] - 0s 0s/step num claasees : 6 2/2 [==============================] - 0s 0s/step num claasees : 6 2/2 [==============================] - 0s 0s/step num claasees : 6 2/2 [==============================] - 0s 0s/step num claasees : 6 2/2 [==============================] - 0s 0s/step num claasees : 6 2/2 [==============================] - 0s 0s/step Neural network Classifier Cross-Validation Accuracy: 71.80% -------------------For datasetC:\Users\hdn7rng\Desktop\Experiments\datasets\cable-------------------- 12/12 [==============================] - 2s 98ms/step resnet50 extracted feature space size : (374, 2048) TSNE reduced resnet50 feature space shape : (374, 2)
C:\Users\hdn7rng\.conda\envs\tf\lib\site-packages\sklearn\cluster\_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10) C:\Users\hdn7rng\.conda\envs\tf\lib\site-packages\sklearn\cluster\_kmeans.py:1440: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=2. warnings.warn(
KMeans Adjusted Rand Index (ARI -1-1): 0.05 KMeans Normalized Mutual Information (NMI 0-1): 0.19 KMeans Fowlkes-Mallows Score (0-1): 0.33 KMeans Vmeasure (0-1): 0.19
HDBSCAN Adjusted Rand Index (ARI -1-1): -0.18 HDBSCAN Normalized Mutual Information (NMI 0-1): 0.06 HDBSCAN Fowlkes-Mallows Score (0-1): 0.56 HDBSCAN Vmeasure (0-1): 0.06
# of outliers : 4.000 Silhouette score : -0.022 Davies Bouldin Index *: 3.799 Calinski Harabasz Index: 5.990 S_Dbw *: 0.988 SVM Classifier Cross-Validation Accuracy: 87.40% k-NN Classifier Cross-Validation Accuracy: 77.80% Random Forest Classifier Cross-Validation Accuracy: 80.47% num claasees : 9 2/2 [==============================] - 0s 0s/step num claasees : 9 2/2 [==============================] - 0s 0s/step num claasees : 9 2/2 [==============================] - 0s 0s/step num claasees : 9 2/2 [==============================] - 0s 0s/step num claasees : 9 2/2 [==============================] - 0s 0s/step num claasees : 9 2/2 [==============================] - 0s 0s/step num claasees : 9 2/2 [==============================] - 0s 0s/step num claasees : 9 2/2 [==============================] - 0s 0s/step num claasees : 9 2/2 [==============================] - 0s 0s/step num claasees : 9 2/2 [==============================] - 0s 0s/step Neural network Classifier Cross-Validation Accuracy: 82.86% -------------------For datasetC:\Users\hdn7rng\Desktop\Experiments\datasets\organic\mnist-------------------- 19/19 [==============================] - 2s 78ms/step resnet50 extracted feature space size : (600, 2048) TSNE reduced resnet50 feature space shape : (600, 2)
C:\Users\hdn7rng\.conda\envs\tf\lib\site-packages\sklearn\cluster\_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10) C:\Users\hdn7rng\.conda\envs\tf\lib\site-packages\sklearn\cluster\_kmeans.py:1440: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=3. warnings.warn(
KMeans Adjusted Rand Index (ARI -1-1): 0.32 KMeans Normalized Mutual Information (NMI 0-1): 0.48 KMeans Fowlkes-Mallows Score (0-1): 0.39 KMeans Vmeasure (0-1): 0.48
HDBSCAN Adjusted Rand Index (ARI -1-1): 0.06 HDBSCAN Normalized Mutual Information (NMI 0-1): 0.17 HDBSCAN Fowlkes-Mallows Score (0-1): 0.27 HDBSCAN Vmeasure (0-1): 0.17
# of outliers : 0.000 Silhouette score : 0.065 Davies Bouldin Index *: 3.213 Calinski Harabasz Index: 30.452 S_Dbw *: 0.826 SVM Classifier Cross-Validation Accuracy: 92.17% k-NN Classifier Cross-Validation Accuracy: 83.33% Random Forest Classifier Cross-Validation Accuracy: 86.83% num claasees : 10 2/2 [==============================] - 0s 6ms/step num claasees : 10 2/2 [==============================] - 0s 0s/step num claasees : 10 2/2 [==============================] - 0s 0s/step num claasees : 10 2/2 [==============================] - 0s 0s/step num claasees : 10 2/2 [==============================] - 0s 0s/step num claasees : 10 2/2 [==============================] - 0s 16ms/step num claasees : 10 2/2 [==============================] - 0s 0s/step num claasees : 10 2/2 [==============================] - 0s 0s/step num claasees : 10 2/2 [==============================] - 0s 16ms/step num claasees : 10 2/2 [==============================] - 0s 0s/step Neural network Classifier Cross-Validation Accuracy: 87.33% CPU times: total: 32min 47s Wall time: 17min 48s
In [159]:
%%time
run = 12
# For each dataset
for dataset in datasets:
print(f'\n\n -------------------For dataset{dataset.dataset_path}-------------------- \n\n\n\n')
efficientnet = CustomModel(model_name='efficientnet', data_loader = dataset)
efficientnet.extract_features()
efficientnet_tsne = FeatureSpaceVisualizer(efficientnet,reduction_method='tsne' )
efficientnet_tsne.visualize()
kmeans_metrics = efficientnet_tsne.kmeans_clustering()
hdb_metrics = efficientnet_tsne.hdbscan_clustering()
efficientnet_metrics = Metrics(feature_space = efficientnet.features, data_loader = dataset)
#Evaluate the dataset
efficientnet_evaluator = ModelEvaluator(efficientnet, dataset, efficientnet_tsne, test_size = 0.3)
ml_scores = efficientnet_evaluator.evaluate_with_cross_validation()
# Populating results fro this dataset
efficientnet_datasets[dataset].append(efficientnet_metrics.metrics)
efficientnet_datasets[dataset]. append(kmeans_metrics)
efficientnet_datasets[dataset]. append(ml_scores)
efficientnet_datasets[dataset]. append(hdb_metrics)
-------------------For datasetC:\Users\hdn7rng\Desktop\Experiments\datasets\organic\catdog-------------------- 63/63 [==============================] - 4s 56ms/step efficientnet extracted feature space size : (2000, 1280) TSNE reduced efficientnet feature space shape : (2000, 2)
C:\Users\hdn7rng\.conda\envs\tf\lib\site-packages\sklearn\cluster\_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10)
KMeans Adjusted Rand Index (ARI -1-1): 0.97 KMeans Normalized Mutual Information (NMI 0-1): 0.94 KMeans Fowlkes-Mallows Score (0-1): 0.99 KMeans Vmeasure (0-1): 0.94
HDBSCAN Adjusted Rand Index (ARI -1-1): 0.32 HDBSCAN Normalized Mutual Information (NMI 0-1): 0.44 HDBSCAN Fowlkes-Mallows Score (0-1): 0.61 HDBSCAN Vmeasure (0-1): 0.44
# of outliers : 2.000 Silhouette score : 0.108 Davies Bouldin Index *: 2.829 Calinski Harabasz Index: 244.216 S_Dbw *: 0.943 SVM Classifier Cross-Validation Accuracy: 99.05% k-NN Classifier Cross-Validation Accuracy: 99.30% Random Forest Classifier Cross-Validation Accuracy: 99.20% num claasees : 2 7/7 [==============================] - 0s 3ms/step num claasees : 2 7/7 [==============================] - 0s 3ms/step num claasees : 2 7/7 [==============================] - 0s 0s/step num claasees : 2 7/7 [==============================] - 0s 0s/step num claasees : 2 7/7 [==============================] - 0s 3ms/step num claasees : 2 7/7 [==============================] - 0s 0s/step num claasees : 2 7/7 [==============================] - 0s 3ms/step num claasees : 2 7/7 [==============================] - 0s 0s/step num claasees : 2 7/7 [==============================] - 0s 3ms/step num claasees : 2 7/7 [==============================] - 0s 3ms/step Neural network Classifier Cross-Validation Accuracy: 99.05% -------------------For datasetC:\Users\hdn7rng\Desktop\Experiments\datasets\ASQMM_CUSTOM-------------------- 92/92 [==============================] - 6s 54ms/step efficientnet extracted feature space size : (2916, 1280) TSNE reduced efficientnet feature space shape : (2916, 2)
C:\Users\hdn7rng\.conda\envs\tf\lib\site-packages\sklearn\cluster\_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10)
KMeans Adjusted Rand Index (ARI -1-1): 0.86 KMeans Normalized Mutual Information (NMI 0-1): 0.75 KMeans Fowlkes-Mallows Score (0-1): 0.96 KMeans Vmeasure (0-1): 0.75
HDBSCAN Adjusted Rand Index (ARI -1-1): 0.88 HDBSCAN Normalized Mutual Information (NMI 0-1): 0.75 HDBSCAN Fowlkes-Mallows Score (0-1): 0.96 HDBSCAN Vmeasure (0-1): 0.75
# of outliers : 23.000 Silhouette score : 0.416 Davies Bouldin Index *: 1.049 Calinski Harabasz Index: 1756.474 S_Dbw *: 0.857 SVM Classifier Cross-Validation Accuracy: 99.28% k-NN Classifier Cross-Validation Accuracy: 96.95% Random Forest Classifier Cross-Validation Accuracy: 97.77% num claasees : 2 10/10 [==============================] - 0s 2ms/step num claasees : 2 10/10 [==============================] - 0s 2ms/step num claasees : 2 10/10 [==============================] - 0s 2ms/step num claasees : 2 10/10 [==============================] - 0s 2ms/step num claasees : 2 10/10 [==============================] - 0s 2ms/step num claasees : 2 10/10 [==============================] - 0s 2ms/step num claasees : 2 10/10 [==============================] - 0s 4ms/step num claasees : 2 10/10 [==============================] - 0s 2ms/step num claasees : 2 10/10 [==============================] - 0s 2ms/step num claasees : 2 10/10 [==============================] - 0s 3ms/step Neural network Classifier Cross-Validation Accuracy: 99.11% -------------------For datasetC:\Users\hdn7rng\Desktop\Experiments\datasets\PEG-------------------- 32/32 [==============================] - 3s 61ms/step efficientnet extracted feature space size : (1000, 1280) TSNE reduced efficientnet feature space shape : (1000, 2)
C:\Users\hdn7rng\.conda\envs\tf\lib\site-packages\sklearn\cluster\_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10) C:\Users\hdn7rng\.conda\envs\tf\lib\site-packages\sklearn\cluster\_kmeans.py:1440: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=4. warnings.warn(
KMeans Adjusted Rand Index (ARI -1-1): 0.04 KMeans Normalized Mutual Information (NMI 0-1): 0.03 KMeans Fowlkes-Mallows Score (0-1): 0.52 KMeans Vmeasure (0-1): 0.03
HDBSCAN Adjusted Rand Index (ARI -1-1): 0.14 HDBSCAN Normalized Mutual Information (NMI 0-1): 0.11 HDBSCAN Fowlkes-Mallows Score (0-1): 0.57 HDBSCAN Vmeasure (0-1): 0.11
# of outliers : 1.000 Silhouette score : 0.087 Davies Bouldin Index *: 3.253 Calinski Harabasz Index: 88.029 S_Dbw *: 0.959 SVM Classifier Cross-Validation Accuracy: 97.90% k-NN Classifier Cross-Validation Accuracy: 94.10% Random Forest Classifier Cross-Validation Accuracy: 95.60% num claasees : 2 4/4 [==============================] - 0s 0s/step num claasees : 2 4/4 [==============================] - 0s 5ms/step num claasees : 2 4/4 [==============================] - 0s 0s/step num claasees : 2 4/4 [==============================] - 0s 0s/step num claasees : 2 4/4 [==============================] - 0s 5ms/step num claasees : 2 4/4 [==============================] - 0s 0s/step num claasees : 2 4/4 [==============================] - 0s 5ms/step num claasees : 2 4/4 [==============================] - 0s 5ms/step num claasees : 2 4/4 [==============================] - 0s 0s/step num claasees : 2 4/4 [==============================] - 0s 0s/step Neural network Classifier Cross-Validation Accuracy: 95.50% -------------------For datasetC:\Users\hdn7rng\Desktop\Experiments\datasets\Mattendichtung-------------------- 175/175 [==============================] - 10s 53ms/step efficientnet extracted feature space size : (5600, 1280) TSNE reduced efficientnet feature space shape : (5600, 2)
C:\Users\hdn7rng\.conda\envs\tf\lib\site-packages\sklearn\cluster\_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10)
KMeans Adjusted Rand Index (ARI -1-1): 0.98 KMeans Normalized Mutual Information (NMI 0-1): 0.96 KMeans Fowlkes-Mallows Score (0-1): 0.98 KMeans Vmeasure (0-1): 0.96
HDBSCAN Adjusted Rand Index (ARI -1-1): 0.95 HDBSCAN Normalized Mutual Information (NMI 0-1): 0.93 HDBSCAN Fowlkes-Mallows Score (0-1): 0.96 HDBSCAN Vmeasure (0-1): 0.93
# of outliers : 117.000 Silhouette score : 0.370 Davies Bouldin Index *: 1.157 Calinski Harabasz Index: 3292.280 S_Dbw *: 0.600 SVM Classifier Cross-Validation Accuracy: 99.54% k-NN Classifier Cross-Validation Accuracy: 99.43% Random Forest Classifier Cross-Validation Accuracy: 99.59% num claasees : 4 18/18 [==============================] - 0s 1ms/step num claasees : 4 18/18 [==============================] - 0s 916us/step num claasees : 4 18/18 [==============================] - 0s 2ms/step num claasees : 4 18/18 [==============================] - 0s 1ms/step num claasees : 4 18/18 [==============================] - 0s 2ms/step num claasees : 4 18/18 [==============================] - 0s 2ms/step num claasees : 4 18/18 [==============================] - 0s 2ms/step num claasees : 4 18/18 [==============================] - 0s 2ms/step num claasees : 4 18/18 [==============================] - 0s 916us/step num claasees : 4 18/18 [==============================] - 0s 2ms/step Neural network Classifier Cross-Validation Accuracy: 99.57% -------------------For datasetC:\Users\hdn7rng\Desktop\Experiments\datasets\metal_nutold-------------------- 12/12 [==============================] - 2s 73ms/step efficientnet extracted feature space size : (358, 1280) TSNE reduced efficientnet feature space shape : (358, 2)
C:\Users\hdn7rng\.conda\envs\tf\lib\site-packages\sklearn\cluster\_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10) C:\Users\hdn7rng\.conda\envs\tf\lib\site-packages\sklearn\cluster\_kmeans.py:1440: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=2. warnings.warn(
KMeans Adjusted Rand Index (ARI -1-1): 0.32 KMeans Normalized Mutual Information (NMI 0-1): 0.33 KMeans Fowlkes-Mallows Score (0-1): 0.77 KMeans Vmeasure (0-1): 0.33
HDBSCAN Adjusted Rand Index (ARI -1-1): 0.32 HDBSCAN Normalized Mutual Information (NMI 0-1): 0.33 HDBSCAN Fowlkes-Mallows Score (0-1): 0.77 HDBSCAN Vmeasure (0-1): 0.33
# of outliers : 1.000 Silhouette score : 0.157 Davies Bouldin Index *: 2.555 Calinski Harabasz Index: 50.548 S_Dbw *: 1.021 SVM Classifier Cross-Validation Accuracy: 93.31% k-NN Classifier Cross-Validation Accuracy: 82.71% Random Forest Classifier Cross-Validation Accuracy: 90.25% num claasees : 2 2/2 [==============================] - 0s 16ms/step num claasees : 2 2/2 [==============================] - 0s 16ms/step num claasees : 2 2/2 [==============================] - 0s 0s/step num claasees : 2 2/2 [==============================] - 0s 0s/step num claasees : 2 2/2 [==============================] - 0s 16ms/step num claasees : 2 2/2 [==============================] - 0s 16ms/step num claasees : 2 2/2 [==============================] - 0s 0s/step num claasees : 2 2/2 [==============================] - 0s 0s/step num claasees : 2 2/2 [==============================] - 0s 0s/step num claasees : 2 2/2 [==============================] - 0s 0s/step Neural network Classifier Cross-Validation Accuracy: 91.09% -------------------For datasetC:\Users\hdn7rng\Desktop\Experiments\datasets\screw-------------------- 15/15 [==============================] - 2s 67ms/step efficientnet extracted feature space size : (480, 1280) TSNE reduced efficientnet feature space shape : (480, 2)
C:\Users\hdn7rng\.conda\envs\tf\lib\site-packages\sklearn\cluster\_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10) C:\Users\hdn7rng\.conda\envs\tf\lib\site-packages\sklearn\cluster\_kmeans.py:1440: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=2. warnings.warn(
KMeans Adjusted Rand Index (ARI -1-1): -0.01 KMeans Normalized Mutual Information (NMI 0-1): 0.02 KMeans Fowlkes-Mallows Score (0-1): 0.34 KMeans Vmeasure (0-1): 0.02
HDBSCAN Adjusted Rand Index (ARI -1-1): 0.12 HDBSCAN Normalized Mutual Information (NMI 0-1): 0.07 HDBSCAN Fowlkes-Mallows Score (0-1): 0.62 HDBSCAN Vmeasure (0-1): 0.07
# of outliers : 0.000 Silhouette score : -0.027 Davies Bouldin Index *: 6.509 Calinski Harabasz Index: 2.289 S_Dbw *: 0.962 SVM Classifier Cross-Validation Accuracy: 86.46% k-NN Classifier Cross-Validation Accuracy: 75.42% Random Forest Classifier Cross-Validation Accuracy: 77.92% num claasees : 6 2/2 [==============================] - 0s 0s/step num claasees : 6 2/2 [==============================] - 0s 0s/step num claasees : 6 2/2 [==============================] - 0s 0s/step num claasees : 6 2/2 [==============================] - 0s 16ms/step num claasees : 6 2/2 [==============================] - 0s 0s/step num claasees : 6 2/2 [==============================] - 0s 0s/step num claasees : 6 2/2 [==============================] - 0s 0s/step num claasees : 6 2/2 [==============================] - 0s 0s/step num claasees : 6 2/2 [==============================] - 0s 0s/step num claasees : 6 2/2 [==============================] - 0s 0s/step Neural network Classifier Cross-Validation Accuracy: 80.62% -------------------For datasetC:\Users\hdn7rng\Desktop\Experiments\datasets\bottle-------------------- 10/10 [==============================] - 2s 74ms/step efficientnet extracted feature space size : (292, 1280) TSNE reduced efficientnet feature space shape : (292, 2)
C:\Users\hdn7rng\.conda\envs\tf\lib\site-packages\sklearn\cluster\_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10) C:\Users\hdn7rng\.conda\envs\tf\lib\site-packages\sklearn\cluster\_kmeans.py:1440: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=2. warnings.warn(
KMeans Adjusted Rand Index (ARI -1-1): 0.26 KMeans Normalized Mutual Information (NMI 0-1): 0.38 KMeans Fowlkes-Mallows Score (0-1): 0.61 KMeans Vmeasure (0-1): 0.38
HDBSCAN Adjusted Rand Index (ARI -1-1): -0.04 HDBSCAN Normalized Mutual Information (NMI 0-1): 0.18 HDBSCAN Fowlkes-Mallows Score (0-1): 0.54 HDBSCAN Vmeasure (0-1): 0.18
# of outliers : 22.000 Silhouette score : 0.240 Davies Bouldin Index *: 2.314 Calinski Harabasz Index: 32.687 S_Dbw *: 1.210 SVM Classifier Cross-Validation Accuracy: 94.52% k-NN Classifier Cross-Validation Accuracy: 87.99% Random Forest Classifier Cross-Validation Accuracy: 94.51% num claasees : 4 1/1 [==============================] - 0s 52ms/step num claasees : 4 1/1 [==============================] - 0s 47ms/step num claasees : 4 1/1 [==============================] - 0s 47ms/step num claasees : 4 1/1 [==============================] - 0s 47ms/step num claasees : 4 1/1 [==============================] - 0s 63ms/step num claasees : 4 1/1 [==============================] - 0s 65ms/step num claasees : 4 1/1 [==============================] - 0s 63ms/step num claasees : 4 1/1 [==============================] - 0s 47ms/step num claasees : 4 1/1 [==============================] - 0s 53ms/step num claasees : 4 1/1 [==============================] - 0s 63ms/step Neural network Classifier Cross-Validation Accuracy: 93.85% -------------------For datasetC:\Users\hdn7rng\Desktop\Experiments\datasets\capsule-------------------- 11/11 [==============================] - 2s 75ms/step efficientnet extracted feature space size : (351, 1280) TSNE reduced efficientnet feature space shape : (351, 2)
C:\Users\hdn7rng\.conda\envs\tf\lib\site-packages\sklearn\cluster\_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10) C:\Users\hdn7rng\.conda\envs\tf\lib\site-packages\sklearn\cluster\_kmeans.py:1440: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=2. warnings.warn(
KMeans Adjusted Rand Index (ARI -1-1): 0.01 KMeans Normalized Mutual Information (NMI 0-1): 0.08 KMeans Fowlkes-Mallows Score (0-1): 0.33 KMeans Vmeasure (0-1): 0.08
HDBSCAN Adjusted Rand Index (ARI -1-1): -0.10 HDBSCAN Normalized Mutual Information (NMI 0-1): 0.12 HDBSCAN Fowlkes-Mallows Score (0-1): 0.40 HDBSCAN Vmeasure (0-1): 0.12
# of outliers : 21.000 Silhouette score : 0.013 Davies Bouldin Index *: 4.394 Calinski Harabasz Index: 6.650 S_Dbw *: 1.093 SVM Classifier Cross-Validation Accuracy: 84.36% k-NN Classifier Cross-Validation Accuracy: 69.52% Random Forest Classifier Cross-Validation Accuracy: 75.23% num claasees : 6 2/2 [==============================] - 0s 0s/step num claasees : 6 2/2 [==============================] - 0s 0s/step num claasees : 6 2/2 [==============================] - 0s 0s/step num claasees : 6 2/2 [==============================] - 0s 0s/step num claasees : 6 2/2 [==============================] - 0s 0s/step num claasees : 6 2/2 [==============================] - 0s 0s/step num claasees : 6 2/2 [==============================] - 0s 0s/step num claasees : 6 2/2 [==============================] - 0s 0s/step num claasees : 6 2/2 [==============================] - 0s 16ms/step num claasees : 6 2/2 [==============================] - 0s 18ms/step Neural network Classifier Cross-Validation Accuracy: 76.06% -------------------For datasetC:\Users\hdn7rng\Desktop\Experiments\datasets\cable-------------------- 12/12 [==============================] - 2s 93ms/step efficientnet extracted feature space size : (374, 1280) TSNE reduced efficientnet feature space shape : (374, 2)
C:\Users\hdn7rng\.conda\envs\tf\lib\site-packages\sklearn\cluster\_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10) C:\Users\hdn7rng\.conda\envs\tf\lib\site-packages\sklearn\cluster\_kmeans.py:1440: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=2. warnings.warn(
KMeans Adjusted Rand Index (ARI -1-1): 0.05 KMeans Normalized Mutual Information (NMI 0-1): 0.23 KMeans Fowlkes-Mallows Score (0-1): 0.32 KMeans Vmeasure (0-1): 0.23
HDBSCAN Adjusted Rand Index (ARI -1-1): -0.12 HDBSCAN Normalized Mutual Information (NMI 0-1): 0.04 HDBSCAN Fowlkes-Mallows Score (0-1): 0.65 HDBSCAN Vmeasure (0-1): 0.04
# of outliers : 2.000 Silhouette score : 0.009 Davies Bouldin Index *: 3.678 Calinski Harabasz Index: 6.614 S_Dbw *: 0.998 SVM Classifier Cross-Validation Accuracy: 87.70% k-NN Classifier Cross-Validation Accuracy: 78.88% Random Forest Classifier Cross-Validation Accuracy: 81.80% num claasees : 9 2/2 [==============================] - 0s 0s/step num claasees : 9 2/2 [==============================] - 0s 0s/step num claasees : 9 2/2 [==============================] - 0s 0s/step num claasees : 9 2/2 [==============================] - 0s 16ms/step num claasees : 9 2/2 [==============================] - 0s 6ms/step num claasees : 9 2/2 [==============================] - 0s 16ms/step num claasees : 9 2/2 [==============================] - 0s 0s/step num claasees : 9 2/2 [==============================] - 0s 16ms/step num claasees : 9 2/2 [==============================] - 0s 0s/step num claasees : 9 2/2 [==============================] - 0s 0s/step Neural network Classifier Cross-Validation Accuracy: 85.28% -------------------For datasetC:\Users\hdn7rng\Desktop\Experiments\datasets\organic\mnist-------------------- 19/19 [==============================] - 2s 70ms/step efficientnet extracted feature space size : (600, 1280) TSNE reduced efficientnet feature space shape : (600, 2)
C:\Users\hdn7rng\.conda\envs\tf\lib\site-packages\sklearn\cluster\_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10) C:\Users\hdn7rng\.conda\envs\tf\lib\site-packages\sklearn\cluster\_kmeans.py:1440: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=3. warnings.warn(
KMeans Adjusted Rand Index (ARI -1-1): 0.30 KMeans Normalized Mutual Information (NMI 0-1): 0.44 KMeans Fowlkes-Mallows Score (0-1): 0.37 KMeans Vmeasure (0-1): 0.44
HDBSCAN Adjusted Rand Index (ARI -1-1): 0.02 HDBSCAN Normalized Mutual Information (NMI 0-1): 0.14 HDBSCAN Fowlkes-Mallows Score (0-1): 0.26 HDBSCAN Vmeasure (0-1): 0.14
# of outliers : 0.000 Silhouette score : 0.051 Davies Bouldin Index *: 3.541 Calinski Harabasz Index: 24.931 S_Dbw *: 0.850 SVM Classifier Cross-Validation Accuracy: 91.00% k-NN Classifier Cross-Validation Accuracy: 78.00% Random Forest Classifier Cross-Validation Accuracy: 84.00% num claasees : 10 2/2 [==============================] - 0s 0s/step num claasees : 10 2/2 [==============================] - 0s 0s/step num claasees : 10 2/2 [==============================] - 0s 0s/step num claasees : 10 2/2 [==============================] - 0s 0s/step num claasees : 10 2/2 [==============================] - 0s 0s/step num claasees : 10 2/2 [==============================] - 0s 16ms/step num claasees : 10 2/2 [==============================] - 0s 0s/step num claasees : 10 2/2 [==============================] - 0s 0s/step num claasees : 10 2/2 [==============================] - 0s 0s/step num claasees : 10 2/2 [==============================] - 0s 16ms/step Neural network Classifier Cross-Validation Accuracy: 84.83% CPU times: total: 28min 39s Wall time: 16min 1s
In [132]:
len(datasets_resnet)  # sanity check: one results entry per dataset was collected
Out[132]:
10
In [160]:
len(efficientnet_datasets)  # sanity check: one results entry per dataset was collected
Out[160]:
10
Storing results¶
In [161]:
import matplotlib.pyplot as plt
import numpy as np

# Flatten the per-dataset EfficientNet results into parallel lists, one
# entry per dataset, for plotting/correlation below.

# Intrinsic cluster-quality metrics (DBI, Silhouette, CH, S_Dbw)
dbi_values = []
silhouette_values = []
ch_values = []
sdbw_values = []
# Cross-validated classifier accuracies
svm_values = []
nn_values = []
knn_values = []
rf_values = []
# HDBSCAN extrinsic metrics
HDBSCANari_values = []
HDBSCANnmi_values = []
HDBSCANfm_values = []
HDBSCANvm_values = []
# KMeans extrinsic metrics
KMeansari_values = []
KMeansnmi_values = []
KMeansfm_values = []
KMeansvm_values = []

# Each value is [intrinsic, kmeans, ml_scores, hdbscan] in the order the
# evaluation loop appended them.
for value in efficientnet_datasets.values():
    intrinsic = value[0]
    kmeans = value[1]
    ml = value[2]
    hdb = value[3]

    # Intrinsic metrics
    dbi_values.append(intrinsic['DBI'])
    silhouette_values.append(intrinsic['silhouette'])
    ch_values.append(intrinsic['CH'])
    sdbw_values.append(intrinsic['sdbw'])

    # Classifier accuracies
    svm_values.append(ml['svm'])
    nn_values.append(ml['nn'])
    rf_values.append(ml['rf'])
    knn_values.append(ml['knn'])

    # HDBSCAN extrinsic metrics
    HDBSCANvm_values.append(hdb['HDBSCANvm'])
    HDBSCANfm_values.append(hdb['HDBSCANfm'])
    HDBSCANnmi_values.append(hdb['HDBSCANnmi'])
    HDBSCANari_values.append(hdb['HDBSCANari'])

    # KMeans extrinsic metrics (original comment wrongly said "For HDBSCAN")
    KMeansvm_values.append(kmeans['KMeansvm'])
    KMeansfm_values.append(kmeans['KMeansfm'])
    KMeansnmi_values.append(kmeans['KMeansnmi'])
    KMeansari_values.append(kmeans['KMeansari'])
ResNet values¶
In [ ]:
In [202]:
import matplotlib.pyplot as plt
import numpy as np

# Flatten the per-dataset ResNet results into parallel lists, one entry
# per dataset, mirroring the EfficientNet extraction cell.

# Intrinsic cluster-quality metrics (DBI, Silhouette, CH, S_Dbw)
resnet_dbi_values = []
resnet_silhouette_values = []
resnet_ch_values = []
resnet_sdbw_values = []
# Cross-validated classifier accuracies
resnet_svm_values = []
resnet_nn_values = []
resnet_knn_values = []
resnet_rf_values = []
# HDBSCAN extrinsic metrics
resnet_HDBSCANari_values = []
resnet_HDBSCANnmi_values = []
resnet_HDBSCANfm_values = []
resnet_HDBSCANvm_values = []
# KMeans extrinsic metrics
resnet_KMeansari_values = []
resnet_KMeansnmi_values = []
resnet_KMeansfm_values = []
resnet_KMeansvm_values = []

# Each value is [intrinsic, kmeans, ml_scores, hdbscan] in the order the
# evaluation loop appended them.
for value in datasets_resnet.values():
    intrinsic = value[0]
    kmeans = value[1]
    ml = value[2]
    hdb = value[3]

    # Intrinsic metrics
    resnet_dbi_values.append(intrinsic['DBI'])
    resnet_silhouette_values.append(intrinsic['silhouette'])
    resnet_ch_values.append(intrinsic['CH'])
    resnet_sdbw_values.append(intrinsic['sdbw'])

    # Classifier accuracies
    resnet_svm_values.append(ml['svm'])
    resnet_nn_values.append(ml['nn'])
    resnet_rf_values.append(ml['rf'])
    resnet_knn_values.append(ml['knn'])

    # HDBSCAN extrinsic metrics
    resnet_HDBSCANvm_values.append(hdb['HDBSCANvm'])
    resnet_HDBSCANfm_values.append(hdb['HDBSCANfm'])
    resnet_HDBSCANnmi_values.append(hdb['HDBSCANnmi'])
    resnet_HDBSCANari_values.append(hdb['HDBSCANari'])

    # KMeans extrinsic metrics (original comment wrongly said "For HDBSCAN")
    resnet_KMeansvm_values.append(kmeans['KMeansvm'])
    resnet_KMeansfm_values.append(kmeans['KMeansfm'])
    resnet_KMeansnmi_values.append(kmeans['KMeansnmi'])
    resnet_KMeansari_values.append(kmeans['KMeansari'])
Plotting ML accuracy results¶
In [163]:
# Scatter each intrinsic metric against SVM accuracy, one panel per metric,
# with a first-order (linear) trend line fitted per panel.
plt.figure(figsize=(24, 6))

# (subplot position, x-values, axis label, matplotlib color code)
panels = [
    (141, dbi_values, 'DBI', 'b'),
    (142, silhouette_values, 'Silhouette', 'g'),
    (143, ch_values, 'CH', 'r'),
    (144, sdbw_values, 'SDBW', 'y'),
]
for pos, xs, label, color in panels:
    plt.subplot(pos)
    plt.scatter(xs, svm_values, c=color, marker='o')
    plt.xlabel(label)
    plt.ylabel('SVM accuracy (%)')
    plt.title(f'{label} vs SVM')
    fit = np.polyfit(xs, svm_values, 1)
    plt.plot(xs, np.polyval(fit, xs), color + '--', label='Trend Line')
    plt.legend()

plt.tight_layout()
plt.savefig('svm_vs_metrics', dpi=300)
plt.show()
In [164]:
# Scatter each intrinsic metric against neural-network accuracy, one panel
# per metric, with a first-order (linear) trend line fitted per panel.
plt.figure(figsize=(24, 6))

# (subplot position, x-values, axis label, matplotlib color code)
panels = [
    (141, dbi_values, 'DBI', 'b'),
    (142, silhouette_values, 'Silhouette', 'g'),
    (143, ch_values, 'CH', 'r'),
    (144, sdbw_values, 'SDBW', 'y'),
]
for pos, xs, label, color in panels:
    plt.subplot(pos)
    plt.scatter(xs, nn_values, c=color, marker='o')
    plt.xlabel(label)
    plt.ylabel('NN accuracy (%)')
    plt.title(f'{label} vs NN')
    fit = np.polyfit(xs, nn_values, 1)
    plt.plot(xs, np.polyval(fit, xs), color + '--', label='Trend Line')
    plt.legend()

plt.tight_layout()
plt.savefig('nn_vs_metrics', dpi=300)
plt.show()
In [165]:
# Scatter each intrinsic metric against Random Forest accuracy, one panel
# per metric, with a first-order (linear) trend line fitted per panel.
plt.figure(figsize=(24, 6))

# (subplot position, x-values, axis label, matplotlib color code)
panels = [
    (141, dbi_values, 'DBI', 'b'),
    (142, silhouette_values, 'Silhouette', 'g'),
    (143, ch_values, 'CH', 'r'),
    (144, sdbw_values, 'SDBW', 'y'),
]
for pos, xs, label, color in panels:
    plt.subplot(pos)
    plt.scatter(xs, rf_values, c=color, marker='o')
    plt.xlabel(label)
    plt.ylabel('RF accuracy (%)')
    plt.title(f'{label} vs RF')
    fit = np.polyfit(xs, rf_values, 1)
    plt.plot(xs, np.polyval(fit, xs), color + '--', label='Trend Line')
    plt.legend()

plt.tight_layout()
# FIX: was saved as 'dbi_vs_metrics', breaking the naming pattern of the
# sibling figures (svm_vs_metrics, nn_vs_metrics, knn_vs_metrics).
plt.savefig('rf_vs_metrics', dpi=300)
plt.show()
In [166]:
# Scatter each intrinsic metric against k-NN accuracy, one panel per metric,
# with a first-order (linear) trend line fitted per panel.
plt.figure(figsize=(24, 6))

# (subplot position, x-values, axis label, matplotlib color code)
panels = [
    (141, dbi_values, 'DBI', 'b'),
    (142, silhouette_values, 'Silhouette', 'g'),
    (143, ch_values, 'CH', 'r'),
    (144, sdbw_values, 'SDBW', 'y'),
]
for pos, xs, label, color in panels:
    plt.subplot(pos)
    plt.scatter(xs, knn_values, c=color, marker='o')
    plt.xlabel(label)
    plt.ylabel('KNN accuracy (%)')
    plt.title(f'{label} vs KNN')
    fit = np.polyfit(xs, knn_values, 1)
    plt.plot(xs, np.polyval(fit, xs), color + '--', label='Trend Line')
    plt.legend()

plt.tight_layout()
plt.savefig('knn_vs_metrics', dpi=300)
plt.show()
Plotting correlation coefficients¶
In [167]:
# Pearson correlation between each intrinsic cluster-quality metric
# (DBI, Silhouette, CH, SDBW) and each classifier's cross-validated accuracy.

# SVM correlation coefficients
correlation_dbi_svm = np.corrcoef(dbi_values, svm_values)[0, 1]
correlation_silhouette_svm = np.corrcoef(silhouette_values, svm_values)[0, 1]
correlation_ch_svm = np.corrcoef(ch_values, svm_values)[0, 1]
correlation_sdbw_svm = np.corrcoef(sdbw_values, svm_values)[0, 1]
print(f"Correlation DBI vs SVM: {correlation_dbi_svm}")
print(f"Correlation Silhouette vs SVM: {correlation_silhouette_svm}")
print(f"Correlation CH vs SVM: {correlation_ch_svm}")
print(f"Correlation SDBW vs SVM: {correlation_sdbw_svm}")

# Neural-network correlation coefficients
correlation_dbi_nn = np.corrcoef(dbi_values, nn_values)[0, 1]
correlation_silhouette_nn = np.corrcoef(silhouette_values, nn_values)[0, 1]
correlation_ch_nn = np.corrcoef(ch_values, nn_values)[0, 1]
correlation_sdbw_nn = np.corrcoef(sdbw_values, nn_values)[0, 1]
print(f"Correlation DBI vs NN: {correlation_dbi_nn}")
print(f"Correlation Silhouette vs NN: {correlation_silhouette_nn}")
print(f"Correlation CH vs NN: {correlation_ch_nn}")
# FIX: label previously read "vs N:N" (colon misplaced inside the name).
print(f"Correlation SDBW vs NN: {correlation_sdbw_nn}")

# Random-forest correlation coefficients
correlation_dbi_rf = np.corrcoef(dbi_values, rf_values)[0, 1]
correlation_silhouette_rf = np.corrcoef(silhouette_values, rf_values)[0, 1]
correlation_ch_rf = np.corrcoef(ch_values, rf_values)[0, 1]
correlation_sdbw_rf = np.corrcoef(sdbw_values, rf_values)[0, 1]
print(f"Correlation DBI vs rf: {correlation_dbi_rf}")
print(f"Correlation Silhouette vs rf: {correlation_silhouette_rf}")
print(f"Correlation CH vs rf: {correlation_ch_rf}")
# FIX: added the colon missing from the original label.
print(f"Correlation SDBW vs rf: {correlation_sdbw_rf}")

# k-NN correlation coefficients
correlation_dbi_knn = np.corrcoef(dbi_values, knn_values)[0, 1]
correlation_silhouette_knn = np.corrcoef(silhouette_values, knn_values)[0, 1]
correlation_ch_knn = np.corrcoef(ch_values, knn_values)[0, 1]
correlation_sdbw_knn = np.corrcoef(sdbw_values, knn_values)[0, 1]
print(f"Correlation DBI vs knn: {correlation_dbi_knn}")
print(f"Correlation Silhouette vs knn: {correlation_silhouette_knn}")
print(f"Correlation CH vs knn: {correlation_ch_knn}")
# FIX: added the colon missing from the original label.
print(f"Correlation SDBW vs knn: {correlation_sdbw_knn}")
Correlation DBI vs SVM: -0.798167740763308 Correlation Silhouette vs SVM: 0.7593030065912305 Correlation CH vs SVM: 0.5753005166994785 Correlation SDBW vs SVM: -0.45841777210158086 Correlation DBI vs NN: -0.8136248441591379 Correlation Silhouette vs NN: 0.7725360057820152 Correlation CH vs NN: 0.5693771051365971 Correlation SDBW vs N:N -0.4048873655068264 Correlation DBI vs rf: -0.8191437551835935 Correlation Silhouette vs rf: 0.7688587524353746 Correlation CH vs rf: 0.5504489804979751 Correlation SDBW vs rf -0.3712995108129758 Correlation DBI vs knn: -0.7462832734835978 Correlation Silhouette vs knn: 0.7281508696141904 Correlation CH vs knn: 0.6103429605699575 Correlation SDBW vs knn -0.4734902235873424
HDBSCAN Extrinsic metrics¶
In [121]:
# HDBSCAN extrinsic metrics: scatter each extrinsic clustering metric
# (ARI, NMI, FM, V-measure) against SVM accuracy with a linear trend line.
plt.figure(figsize=(24, 6))
# ARI vs SVM
plt.subplot(141)
plt.scatter(HDBSCANari_values, svm_values, c='b', marker='o')
plt.xlabel('Adjusted Rand Index')
plt.ylabel('SVM accuracy (%)')
plt.title('HDBSCAN ARI vs SVM')
HDBSCANari_fit = np.polyfit(HDBSCANari_values, svm_values, 1)  # linear fit
plt.plot(HDBSCANari_values, np.polyval(HDBSCANari_fit, HDBSCANari_values), 'b--', label='Trend Line')
plt.legend()
# NMI vs SVM
plt.subplot(142)
plt.scatter(HDBSCANnmi_values, svm_values, c='g', marker='o')
plt.xlabel('Normalized Mutual Index')
plt.ylabel('SVM accuracy (%)')
plt.title('HDBSCAN NMI vs SVM')  # fixed typo: was 'HBSCAN NMI vs SVM'
HDBSCANnmi_fit = np.polyfit(HDBSCANnmi_values, svm_values, 1)
plt.plot(HDBSCANnmi_values, np.polyval(HDBSCANnmi_fit, HDBSCANnmi_values), 'g--', label='Trend Line')
plt.legend()
# FM vs SVM
plt.subplot(143)
plt.scatter(HDBSCANfm_values, svm_values, c='r', marker='o')
plt.xlabel('Fowlkes-Mallows')
plt.ylabel('SVM accuracy (%)')
plt.title('HDBSCAN FM vs SVM')
HDBSCANfm_fit = np.polyfit(HDBSCANfm_values, svm_values, 1)
plt.plot(HDBSCANfm_values, np.polyval(HDBSCANfm_fit, HDBSCANfm_values), 'r--', label='Trend Line')
plt.legend()
# V-measure vs SVM
plt.subplot(144)
plt.scatter(HDBSCANvm_values, svm_values, c='y', marker='o')
plt.xlabel('V-measure')
plt.ylabel('SVM accuracy (%)')
plt.title('HDBSCAN VM vs SVM')
HDBSCANvm_fit = np.polyfit(HDBSCANvm_values, svm_values, 1)
plt.plot(HDBSCANvm_values, np.polyval(HDBSCANvm_fit, HDBSCANvm_values), 'y--', label='Trend Line')
plt.legend()
plt.tight_layout()
plt.savefig('HDBSCAN_metrics', dpi=300)
plt.show()
KMEANS metrics¶
In [122]:
# KMeans extrinsic metrics (comment previously said "HDB metrics" — fixed):
# same four-panel layout as the HDBSCAN figure above.
plt.figure(figsize=(24, 6))
# ARI vs SVM
plt.subplot(141)
plt.scatter(KMeansari_values, svm_values, c='b', marker='o')
plt.xlabel('Adjusted Rand Index')
plt.ylabel('SVM accuracy (%)')
plt.title('KMeans ARI vs SVM')
KMeansari_fit = np.polyfit(KMeansari_values, svm_values, 1)  # linear fit
plt.plot(KMeansari_values, np.polyval(KMeansari_fit, KMeansari_values), 'b--', label='Trend Line')
plt.legend()
# NMI vs SVM
plt.subplot(142)
plt.scatter(KMeansnmi_values, svm_values, c='g', marker='o')
plt.xlabel('Normalized Mutual Index')
plt.ylabel('SVM accuracy (%)')
plt.title('KMeans NMI vs SVM')
KMeansnmi_fit = np.polyfit(KMeansnmi_values, svm_values, 1)
plt.plot(KMeansnmi_values, np.polyval(KMeansnmi_fit, KMeansnmi_values), 'g--', label='Trend Line')
plt.legend()
# FM vs SVM
plt.subplot(143)
plt.scatter(KMeansfm_values, svm_values, c='r', marker='o')
plt.xlabel('Fowlkes-Mallows')
plt.ylabel('SVM accuracy (%)')
plt.title('KMeans FM vs SVM')  # consistent capitalization (was 'Kmeans')
KMeansfm_fit = np.polyfit(KMeansfm_values, svm_values, 1)
plt.plot(KMeansfm_values, np.polyval(KMeansfm_fit, KMeansfm_values), 'r--', label='Trend Line')
plt.legend()
# V-measure vs SVM
plt.subplot(144)
plt.scatter(KMeansvm_values, svm_values, c='y', marker='o')
plt.xlabel('V-measure')
plt.ylabel('SVM accuracy (%)')
plt.title('KMeans VM vs SVM')
KMeansvm_fit = np.polyfit(KMeansvm_values, svm_values, 1)
plt.plot(KMeansvm_values, np.polyval(KMeansvm_fit, KMeansvm_values), 'y--', label='Trend Line')
plt.legend()
plt.tight_layout()
plt.savefig('KMeans_metrics', dpi=300)
plt.show()
In [123]:
# Calculate HDBSCAN correlation coefficients.
# (The variables hold HDBSCAN metrics; comments and print labels previously
# said "DBSCAN" — fixed for accuracy.)
correlation_HDBSCANari = np.corrcoef(HDBSCANari_values, svm_values)[0, 1]
correlation_HDBSCANnmi = np.corrcoef(HDBSCANnmi_values, svm_values)[0, 1]
correlation_HDBSCANfm = np.corrcoef(HDBSCANfm_values, svm_values)[0, 1]
correlation_HDBSCANvm = np.corrcoef(HDBSCANvm_values, svm_values)[0, 1]
print(f"Correlation HDBSCAN vs ari: {correlation_HDBSCANari}")
print(f"Correlation HDBSCAN vs nmi: {correlation_HDBSCANnmi}")
print(f"Correlation HDBSCAN vs fm: {correlation_HDBSCANfm}")
print(f"Correlation HDBSCAN vs vm: {correlation_HDBSCANvm}")
# Calculate KMeans correlation coefficients
correlation_KMeansari = np.corrcoef(KMeansari_values, svm_values)[0, 1]
correlation_KMeansnmi = np.corrcoef(KMeansnmi_values, svm_values)[0, 1]
correlation_KMeansfm = np.corrcoef(KMeansfm_values, svm_values)[0, 1]
correlation_KMeansvm = np.corrcoef(KMeansvm_values, svm_values)[0, 1]
print(f"Correlation KMeans vs ari: {correlation_KMeansari}")  # added missing space
print(f"Correlation KMeans vs nmi: {correlation_KMeansnmi}")
print(f"Correlation KMeans vs fm: {correlation_KMeansfm}")
print(f"Correlation KMeans vs vm: {correlation_KMeansvm}")
Correlation DBSCAN vs ari: 0.5695347540556673 Correlation DBSCAN vs nmi: 0.6273229488070051 Correlation DBSCAN vs fm: 0.49300322912035316 Correlation DBSCAN vs vm: 0.6273229488070051 CorrelationKMeans vs ari: 0.7701772796984184 CorrelationKMeans vs nmi: 0.7378874033350974 CorrelationKMeans vs fm: 0.8288608770481086 CorrelationKMeans vs vm: 0.7378874033350972
RESNET VS EFFICIENTNET¶
In [218]:
str(dataset.dataset_path).split('\\')[-1]
Out[218]:
'mnist'
In [ ]:
In [285]:
import matplotlib.pyplot as plt

# Collect the final path component (the dataset folder name, Windows '\\'
# separator) for every ResNet dataset.
dataset_names = [str(ds.dataset_path).split('\\')[-1] for ds in datasets_resnet]
def compare_nets(efficientnet_values, resnet_values, metric='Metric', save=False):
    """Plot a per-dataset comparison of one clustering metric for the two backbones.

    Parameters
    ----------
    efficientnet_values : sequence of float
        Metric value per dataset, computed on EfficientNetB0 features.
    resnet_values : sequence of float
        Metric value per dataset, computed on ResNet50 features.
    metric : str
        Metric name used for the y-axis label, title and output filename.
    save : bool
        When True, save the figure as '<metric>_resnet_vs_efficientnet.png'.
        Bug fix: the previous default was the *string* 'False', which is
        truthy and therefore saved the figure on every default call.
    """
    # Fixed dataset order; must match the order of the metric value lists.
    datasets = ['CatDog', 'ASQMM', 'PEG', 'Mattendichtung', 'MetalNut', 'Screw',
                'Bottle', 'Capsule', 'Cable', 'MNIST']
    plt.figure(figsize=(10, 6))
    plt.plot(datasets, resnet_values, marker='o', label='ResNet50', linestyle='-', color='blue')
    plt.plot(datasets, efficientnet_values, marker='s', label='EfficientNetB0', linestyle='--', color='green')
    plt.xlabel('Datasets')
    plt.ylabel(f'{metric}')
    # NOTE(review): "(lower is better)" is hard-coded but only holds for
    # metrics like DBI / S_Dbw, not e.g. Silhouette — confirm against usage.
    plt.title(f'{metric} comparison between ResNet50 and EfficientNetB0 (lower is better)')
    plt.legend()
    plt.grid(True)
    plt.xticks(rotation=45)
    plt.tight_layout()
    if save:
        plt.savefig(f'{metric}_resnet_vs_efficientnet.png', dpi=300)
    plt.show()
# Compare the S_Dbw index across backbones (lower S_Dbw indicates better clustering).
metric = 'S_Dbw'
efficientnet_values = sdbw_values
resnet_values = resnet_sdbw_values
compare_nets(efficientnet_values , resnet_values, metric = metric, save =False)
In [266]:
# Print the S_Dbw score of every EfficientNet dataset.
# Iterate .values() directly — the key was fetched but never used.
for value in efficientnet_datasets.values():
    print(value[0]['sdbw'])
0.9431370556193436 0.8565630287350994 0.9587223308921301 0.6000322771071182 1.0205056979817397 0.9622661682242984 1.2102381195394367 1.092981590351097 0.9977684455924868 0.8503743728543772
In [256]:
# Print the Silhouette score of every ResNet dataset.
# Iterate .values() directly — the key was fetched but never used.
for value in datasets_resnet.values():
    print(value[0]['silhouette'])
0.09028799 0.40238285 0.052925892 0.31745523 0.19109415 -0.04304799 0.19306804 -0.022640416 -0.02170907 0.06452677
Complex graph¶
In [417]:
def complex_graph(efficientnet_metric, resnet_metric, efficientnet_performance,
                  resnet_performance, model='ML model', metric='Metric', save=False):
    """Overlay a clustering-metric line chart with classifier-accuracy bars.

    Left y-axis: the metric, one line per backbone. Right y-axis (twin): SVM
    accuracy per backbone as translucent bars, clipped to [70, 100].

    Parameters
    ----------
    efficientnet_metric, resnet_metric : sequence of float
        Metric value per dataset for each backbone.
    efficientnet_performance, resnet_performance : sequence of float
        Classifier accuracy (%) per dataset for each backbone.
    model : str
        Classifier name. NOTE(review): currently unused in the body; the
        right-axis label is hard-coded to 'SVM accuracy' — confirm intent.
    metric : str
        Metric name used for the title, left axis label and output filename.
        Bug fix: the default was previously `metric=metric`, which silently
        captured whatever the global `metric` happened to be when the
        function was defined.
    save : bool
        When True, save the figure as '<metric>_complex_graph.png'.
    """
    # Fixed dataset order; must match the order of all four value lists.
    datasets = ['CatDog', 'ASQMM', 'PEG', 'Mattendichtung', 'MetalNut', 'Screw',
                'Bottle', 'Capsule', 'Cable', 'MNIST']
    fig, ax1 = plt.subplots(figsize=(12, 6))  # removed stray trailing comma in unpack
    plt.title(f'{metric} comparison between ResNet50 and EfficientNetB0 (higher is better)')
    # Line chart on the primary y-axis (ax1).
    ax1.plot(datasets, resnet_metric, marker='o', label='ResNet50', linestyle='-', color='blue')
    ax1.plot(datasets, efficientnet_metric, marker='s', label='EfficientNetB0', linestyle='--', color='green')
    ax1.set_xlabel('Datasets')
    ax1.set_ylabel(f'{metric}', color='black')
    ax1.tick_params(axis='y', labelcolor='black')
    ax1.legend(loc='lower left')
    ax1.grid(True)
    ax1.tick_params(axis='x', rotation=45)
    # Bar chart on a twin y-axis: side-by-side bars offset by half a bar width.
    bar_width = 0.16
    num_datasets = np.arange(len(datasets))
    ax2 = ax1.twinx()
    ax2.bar(num_datasets - bar_width / 2, resnet_performance, width=bar_width, alpha=0.1, label=f'ResNet50', color='blue')
    ax2.bar(num_datasets + bar_width / 2, efficientnet_performance, width=bar_width, alpha=0.1, label=f'EfficientnetB0', color='green')
    ax2.set_ylabel('SVM accuracy', color='black')
    ax2.tick_params(axis='y', labelcolor='black')
    ax2.legend(loc='upper right')
    ax2.set_ylim([70, 100])  # zoom the accuracy axis to the interesting range
    # Handles kept in case the combined legend below is re-enabled.
    lines, labels = ax1.get_legend_handles_labels()
    lines2, labels2 = ax2.get_legend_handles_labels()
    # ax1.legend(lines + lines2, labels + labels2, loc='upper left')
    if save:
        plt.savefig(f'{metric}_complex_graph.png', bbox_inches='tight', dpi=300)
    plt.show()
# Overlay Silhouette scores (lines) with SVM accuracy (bars) per backbone.
metric = 'Silhouette score'
efficientnet_values = silhouette_values
resnet_values = resnet_silhouette_values
efficientnet_performance = svm_values
resnet_performance = resnet_svm_values
# NOTE(review): `metric` is not passed here, so the call falls back to the
# function's default rather than the 'Silhouette score' set above — verify
# the plot title is the intended one.
complex_graph(efficientnet_values, resnet_values, efficientnet_performance, resnet_performance, model = 'SVM', save = True)
In [318]:
efficientnet_performance
Out[318]:
[99.05, 99.2795273737231, 97.9, 99.53571428571429, 93.30952380952381, 86.45833333333334, 94.51724137931035, 84.35714285714285, 87.69559032716927, 90.99999999999999]
In [320]:
resnet_performance
Out[320]:
[98.85, 99.41674904674483, 97.39999999999999, 99.46428571428572, 93.03174603174604, 81.66666666666667, 93.13793103448275, 81.76984126984127, 87.40398293029872, 92.16666666666666]
In [324]:
import matplotlib.pyplot as plt
import numpy as np

# Demo figure: accuracy (line chart, top) and speed (bar chart, bottom)
# for two backbones over five sample datasets.
datasets = ["Dataset 1", "Dataset 2", "Dataset 3", "Dataset 4", "Dataset 5"]
resnet_accuracy = [0.85, 0.88, 0.92, 0.78, 0.91]
efficientnet_accuracy = [0.87, 0.89, 0.91, 0.75, 0.92]
resnet_speed = [10, 12, 15, 8, 9]          # sample speed data, ResNet
efficientnet_speed = [8, 9, 10, 11, 12]    # sample speed data, EfficientNet

# Two stacked panels sharing the figure.
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 8))

# Top panel: accuracy comparison as two line series.
ax1.plot(datasets, resnet_accuracy, marker='o', linestyle='-', color='blue',
         label='ResNet Accuracy')
ax1.plot(datasets, efficientnet_accuracy, marker='s', linestyle='--', color='green',
         label='EfficientNet Accuracy')
ax1.set(xlabel='Datasets', ylabel='Accuracy', title='Accuracy Comparison')
ax1.legend()
ax1.grid(True)
ax1.tick_params(axis='x', rotation=45)

# Bottom panel: speed comparison as two overlaid translucent bar series.
ax2.bar(datasets, resnet_speed, alpha=0.5, color='orange', label='ResNet Speed')
ax2.bar(datasets, efficientnet_speed, alpha=0.5, color='red',
        label='EfficientNet Speed')
ax2.set(xlabel='Datasets', ylabel='Speed', title='Speed Comparison')
ax2.legend()
ax2.grid(True)
ax2.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()